Commit 6fe3627a authored by Chao Liu, committed by GitHub

Composable kernel init integration v3 (#1097)

* Squashed 'src/composable_kernel/' content from commit f6edda61

git-subtree-dir: src/composable_kernel
git-subtree-split: f6edda61

* add solver ConvIgemmFwdV6r1DlopsNchwKcyxNkhw; rename static ck source files

* Squashed 'src/composable_kernel/' changes from f6edda61..5781adf5

5781adf5 Update develop (#5) (#6)
97e6d514 Merge pull request #4 from ROCmSoftwarePlatform/separate_online_compile
7b1ec41e refactor
49c33aae refactor
54b3e73d rename

git-subtree-dir: src/composable_kernel
git-subtree-split: 5781adf5



* fix

* refactor

* remove online compilation from CK

* refactor

* fix

* add ctest

* add c-style pointer cast

* vector/scalar pointer cast use c-style pointer cast instead of reinterpret_cast

* fix clang warning suppression

* tidy

* suppress cppcheck

* fix enum issue

* revert changes to hip build

* fix kernel filename

* update CK build script

* rename

* rename

* make inner product compatible on gfx900

* Update src/include/miopen/solver/ck_utility_common.hpp
Co-authored-by: JD <Jehandad.Khan@amd.com>

* compiler parameter use stream

* use int instead of index_t in kernel wrapper

* DynamicBuffer, StaticBuffer, amd_buffer_load support customized value for invalid element

* refactor

* refactor

* change cmakelist

* change ck common utility

* fix
Co-authored-by: JD <Jehandad.Khan@amd.com>
#ifndef CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
#define CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
#include "common_header.hpp"
#include "multi_index_transform.hpp"
namespace ck {
template <typename LowLength>
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength& low_length)
{
return PassThrough<LowLength>{low_length};
}
template <typename LowLength, typename LeftPad, typename RightPad, bool SkipIsValidCheck = false>
__host__ __device__ constexpr auto
make_pad_transform(const LowLength& low_length,
const LeftPad& left_pad,
const RightPad& right_pad,
integral_constant<bool, SkipIsValidCheck> = integral_constant<bool, false>{})
{
return Pad<LowLength, LeftPad, RightPad, SkipIsValidCheck>{low_length, left_pad, right_pad};
}
template <typename LowLength, typename LeftPadLength, bool SkipIsValidCheck = false>
__host__ __device__ constexpr auto make_left_pad_transform(
const LowLength& low_length,
const LeftPadLength& left_pad,
integral_constant<bool, SkipIsValidCheck> = integral_constant<bool, false>{})
{
return LeftPad<LowLength, LeftPadLength, SkipIsValidCheck>{low_length, left_pad};
}
template <typename LowLength, typename RightPadLength, bool SkipIsValidCheck = false>
__host__ __device__ constexpr auto make_right_pad_transform(
const LowLength& low_length,
const RightPadLength& right_pad,
integral_constant<bool, SkipIsValidCheck> = integral_constant<bool, false>{})
{
return RightPad<LowLength, RightPadLength, SkipIsValidCheck>{low_length, right_pad};
}
template <typename UpLengths,
typename Coefficients,
typename enable_if<UpLengths::Size() == Coefficients::Size(), bool>::type = false>
__host__ __device__ constexpr auto make_embed_transform(const UpLengths& up_lengths,
const Coefficients& coefficients)
{
return Embed<UpLengths, Coefficients>{up_lengths, coefficients};
}
template <typename LowLengths>
__host__ __device__ constexpr auto make_merge_transform(const LowLengths& low_lengths)
{
#if !CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION
return Merge_v1_carry_check<LowLengths>{low_lengths};
#else
#if 1
return Merge_v2_magic_division<LowLengths>{low_lengths};
#else
return Merge_v2r2_magic_division<LowLengths>{low_lengths};
#endif
#endif
}
template <typename LowLengths>
__host__ __device__ constexpr auto
make_merge_transform_v2_magic_division(const LowLengths& low_lengths)
{
return Merge_v2_magic_division<LowLengths>{low_lengths};
}
template <typename UpLengths, bool Use24BitIntegerCalculation = false>
__host__ __device__ constexpr auto make_unmerge_transform(
const UpLengths& up_lengths,
integral_constant<bool, Use24BitIntegerCalculation> = integral_constant<bool, false>{})
{
return UnMerge<UpLengths, Use24BitIntegerCalculation>{up_lengths};
}
template <typename LowerIndex>
__host__ __device__ constexpr auto make_freeze_transform(const LowerIndex& low_idx)
{
return Freeze<LowerIndex>{low_idx};
}
template <typename LowLength, typename SliceBegin, typename SliceEnd>
__host__ __device__ constexpr auto make_slice_transform(const LowLength& low_length,
const SliceBegin& slice_begin,
const SliceEnd& slice_end)
{
return Slice<LowLength, SliceBegin, SliceEnd>{low_length, slice_begin, slice_end};
}
template <typename VectorSize, typename UpLength>
__host__ __device__ constexpr auto make_vectorize_transform(const VectorSize& vector_size,
const UpLength& up_length)
{
return Vectorize<VectorSize, UpLength>{vector_size, up_length};
}
} // namespace ck
#endif
#ifndef CK_TENSOR_ADAPTOR_HPP
#define CK_TENSOR_ADAPTOR_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
namespace ck {
// Transforms: Tuple<transforms...>
// LowerDimensionHiddenIdss : Tuple<Sequence<...>, ...>
// UpperDimensionHiddenIdss : Tuple<Sequence<...>, ...>
// BottomDimensionHiddenIds : Sequence<...>
// TopDimensionHiddenIds : Sequence<...>
template <typename Transforms,
typename LowerDimensionHiddenIdss,
typename UpperDimensionHiddenIdss,
typename BottomDimensionHiddenIds,
typename TopDimensionHiddenIds>
struct TensorAdaptor
{
__host__ __device__ static constexpr index_t GetNumOfTransform() { return Transforms::Size(); }
__host__ __device__ constexpr const auto& GetTransforms() const { return transforms_; }
__host__ __device__ static constexpr auto GetLowerDimensionHiddenIdss()
{
return LowerDimensionHiddenIdss{};
}
__host__ __device__ static constexpr auto GetUpperDimensionHiddenIdss()
{
return UpperDimensionHiddenIdss{};
}
__host__ __device__ static constexpr auto GetTopDimensionHiddenIds()
{
return TopDimensionHiddenIds{};
}
__host__ __device__ static constexpr auto GetBottomDimensionHiddenIds()
{
return BottomDimensionHiddenIds{};
}
__host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms)
{
const auto lengths = generate_tuple(
[&](auto idim_top) {
constexpr auto tmp = GetTransformAndItsUpperDimension(idim_top);
constexpr index_t itran = tmp[Number<0>{}];
constexpr index_t idim_up = tmp[Number<1>{}];
constexpr bool found = tmp[Number<2>{}];
static_assert(found == true,
"wrong! not found matching transformation and upper-dimension");
const auto length =
transforms[Number<itran>{}].GetUpperLengths()[Number<idim_up>{}];
return length;
},
Number<ndim_top_>{});
// TODO: make container_reduce support tuple of Number and index_t
return container_reduce(lengths, math::multiplies{}, Number<1>{});
}
template <index_t IDim>
__host__ __device__ static constexpr auto GetTransformAndItsUpperDimension(Number<IDim>)
{
constexpr auto idim_top = Number<IDim>{};
constexpr index_t idim_hidden = TopDimensionHiddenIds::At(idim_top);
index_t itran_found = 0;
index_t idim_up_found = 0;
bool found = false;
static_for<0, ntransform_, 1>{}([&](auto itran) {
constexpr auto up_dim_ids = UpperDimensionHiddenIdss{}[itran];
static_for<0, up_dim_ids.Size(), 1>{}([&](auto idim_up) {
if constexpr(up_dim_ids[idim_up] == idim_hidden)
{
itran_found = itran;
idim_up_found = idim_up;
found = true;
}
});
});
return make_tuple(itran_found, idim_up_found, found);
}
__host__ __device__ static constexpr index_t GetNumOfBottomDimension()
{
return BottomDimensionHiddenIds::Size();
}
__host__ __device__ static constexpr index_t GetNumOfTopDimension()
{
return TopDimensionHiddenIds::Size();
}
__host__ __device__ static constexpr index_t GetNumOfHiddenDimension()
{
constexpr auto all_low_dim_ids = unpack(
[](auto&&... xs) constexpr { return merge_sequences(xs...); },
LowerDimensionHiddenIdss{});
constexpr auto all_up_dim_ids = unpack(
[](auto&&... xs) constexpr { return merge_sequences(xs...); },
UpperDimensionHiddenIdss{});
constexpr auto all_dim_ids = merge_sequences(all_low_dim_ids, all_up_dim_ids);
using unique_sort_all_dim_ids = typename sequence_unique_sort<decltype(all_dim_ids),
math::less<index_t>,
math::equal<index_t>>::type;
return unique_sort_all_dim_ids::Size();
}
constexpr static index_t ntransform_ = GetNumOfTransform();
constexpr static index_t ndim_hidden_ = GetNumOfHiddenDimension();
constexpr static index_t ndim_bottom_ = GetNumOfBottomDimension();
constexpr static index_t ndim_top_ = GetNumOfTopDimension();
using HiddenIndex = MultiIndex<ndim_hidden_>;
using BottomIndex = MultiIndex<ndim_bottom_>;
using TopIndex = MultiIndex<ndim_top_>;
// may be index_t or Number<>
using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;
public:
__host__ __device__ constexpr TensorAdaptor() = default;
__host__ __device__ constexpr TensorAdaptor(const Transforms& transforms)
: transforms_{transforms}, element_size_{InitializeElementSize(transforms)}
{
static_assert(Transforms::Size() == ntransform_ &&
LowerDimensionHiddenIdss::Size() == ntransform_ &&
UpperDimensionHiddenIdss::Size() == ntransform_,
"wrong! inconsistent # of transformations");
// TODO check dependency of dimensions is valid
}
__host__ __device__ constexpr auto GetElementSize() const { return element_size_; }
template <typename TopIdx>
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
{
static_assert(TopIdx::Size() == TopDimensionHiddenIds::Size(),
"wrong! # of dimension inconsistent");
constexpr index_t ntransform = GetNumOfTransform();
constexpr index_t ndim_hidden = GetNumOfHiddenDimension();
MultiIndex<ndim_hidden> idx_hidden;
// initialize the top (upper-most) index
set_container_subset(idx_hidden, GetTopDimensionHiddenIds(), idx_top);
// calculate hidden index
static_for<ntransform, 0, -1>{}([&](auto itran_p1) {
auto itran = itran_p1 - Number<1>{};
const auto& tran = GetTransforms().At(itran);
constexpr auto dims_low = GetLowerDimensionHiddenIdss().At(itran);
constexpr auto dims_up = GetUpperDimensionHiddenIdss().At(itran);
const auto idx_up = get_container_subset(idx_hidden, dims_up);
MultiIndex<dims_low.Size()> idx_low;
tran.CalculateLowerIndex(idx_low, idx_up);
set_container_subset(idx_hidden, dims_low, idx_low);
});
return get_container_subset(idx_hidden, BottomDimensionHiddenIds{});
}
__host__ __device__ static constexpr bool IsKnownAtCompileTime()
{
bool is_known = true;
static_for<0, Transforms::Size(), 1>{}([&](auto i) {
is_known &=
remove_cv_t<remove_reference_t<decltype(Transforms{}[i])>>::IsKnownAtCompileTime();
});
return is_known && is_known_at_compile_time<ElementSize>::value;
}
__host__ __device__ void Print() const
{
printf("{");
printf("TensorAdaptor, ");
static_for<0, ntransform_, 1>{}([&](auto i) {
printf("transforms: ");
transforms_[i].Print();
printf("LowerDimensionHiddenIds:");
LowerDimensionHiddenIdss{}.At(i).Print();
printf("UpperDimensionHiddenIds:");
UpperDimensionHiddenIdss{}.At(i).Print();
});
printf("BottomDimensionHiddenIds:");
BottomDimensionHiddenIds::Print();
printf("TopDimensionHiddenIds:");
TopDimensionHiddenIds::Print();
printf("}");
}
private:
Transforms transforms_;
ElementSize element_size_;
};
template <typename TensorAdaptor0, typename TensorAdaptor1>
__host__ __device__ constexpr auto chain_tensor_adaptors(const TensorAdaptor0& adaptor0,
const TensorAdaptor1& adaptor1)
{
static_assert(TensorAdaptor0::GetNumOfTopDimension() ==
TensorAdaptor1::GetNumOfBottomDimension(),
"wrong!");
// all_transforms = transform0 + transform1
const auto all_transforms =
container_concat(adaptor0.GetTransforms(), adaptor1.GetTransforms());
// shift
constexpr index_t adaptor0_max_hidden_id = [&]() {
index_t adaptor0_max_hidden_id_ = NumericLimits<index_t>::Min();
static_for<0, TensorAdaptor0::GetNumOfTransform(), 1>{}([&](auto itran) {
constexpr index_t ndim_low =
TensorAdaptor0{}.GetTransforms()[itran].GetNumOfLowerDimension();
static_for<0, ndim_low, 1>{}([&](auto idim_low) {
adaptor0_max_hidden_id_ =
math::max(adaptor0_max_hidden_id_,
TensorAdaptor0::GetLowerDimensionHiddenIdss()[itran][idim_low].value);
});
constexpr index_t ndim_up =
TensorAdaptor0{}.GetTransforms()[itran].GetNumOfUpperDimension();
static_for<0, ndim_up, 1>{}([&](auto idim_up) {
adaptor0_max_hidden_id_ =
math::max(adaptor0_max_hidden_id_,
TensorAdaptor0::GetUpperDimensionHiddenIdss()[itran][idim_up].value);
});
});
return adaptor0_max_hidden_id_;
}();
constexpr index_t adaptor1_min_hidden_id = [&]() {
index_t adaptor1_min_hidden_id_ = NumericLimits<index_t>::Max();
static_for<0, TensorAdaptor1::GetNumOfTransform(), 1>{}([&](auto itran) {
constexpr index_t ndim_low =
TensorAdaptor1{}.GetTransforms()[itran].GetNumOfLowerDimension();
// get the min of all lower dimensions, but not the bottom dimensions (because their ids will
// be matched with the top ids from adaptor0)
static_for<0, ndim_low, 1>{}([&](auto idim_low) {
constexpr index_t low_dim_hidden_id =
TensorAdaptor1::GetLowerDimensionHiddenIdss()[itran][idim_low].value;
bool is_bottom_dim = false;
static_for<0, TensorAdaptor1::GetNumOfBottomDimension(), 1>{}([&](auto i) {
if constexpr(low_dim_hidden_id ==
TensorAdaptor1::GetBottomDimensionHiddenIds()[i])
{
is_bottom_dim = true;
}
});
if(!is_bottom_dim)
{
adaptor1_min_hidden_id_ = math::min(adaptor1_min_hidden_id_, low_dim_hidden_id);
}
});
constexpr index_t ndim_up =
TensorAdaptor1{}.GetTransforms()[itran].GetNumOfUpperDimension();
// get the min of all upper dimensions
static_for<0, ndim_up, 1>{}([&](auto idim_up) {
adaptor1_min_hidden_id_ =
math::min(adaptor1_min_hidden_id_,
TensorAdaptor1::GetUpperDimensionHiddenIdss()[itran][idim_up].value);
});
});
return adaptor1_min_hidden_id_;
}();
constexpr index_t adaptor1_hidden_id_shift =
adaptor0_max_hidden_id + 1 - adaptor1_min_hidden_id;
constexpr index_t ndim_bottom_1 = TensorAdaptor1::GetNumOfBottomDimension();
// all_low_dim_hidden_idss =
// low_dim_hidden_idss_0 + match_hidden_id_for_1(shift_hidden_id_for_1(low_dim_hidden_idss_1))
constexpr auto low_dim_hidden_idss_1 = generate_tuple(
// generate sequence of ids for a transform
[&](auto itran) {
constexpr auto ndim_low_1 = TensorAdaptor1::GetLowerDimensionHiddenIdss()[itran].Size();
constexpr auto low_dim_hidden_ids_1 =
TensorAdaptor1::GetLowerDimensionHiddenIdss()[itran];
// sequence in, sequence out
constexpr auto low_dim_hidden_ids_1_mod = [&]() constexpr
{
auto low_dim_hidden_ids_1_mod_ = to_multi_index(low_dim_hidden_ids_1);
// shift hidden id so every dim id is unique
static_for<0, ndim_low_1, 1>{}([&](auto idim_low_1) {
low_dim_hidden_ids_1_mod_(idim_low_1) += adaptor1_hidden_id_shift;
});
// match hidden id
static_for<0, ndim_low_1, 1>{}([&](auto idim_low_1) {
static_for<0, ndim_bottom_1, 1>{}([&](auto idim_bottom_1) {
// if this low dim is bottom dim, then do id matching
if constexpr(low_dim_hidden_ids_1[idim_low_1] ==
TensorAdaptor1::GetBottomDimensionHiddenIds()[idim_bottom_1])
{
low_dim_hidden_ids_1_mod_(idim_low_1) =
TensorAdaptor0::GetTopDimensionHiddenIds()[idim_bottom_1];
}
});
});
return low_dim_hidden_ids_1_mod_;
}
();
return generate_sequence_v2(
[&](auto i) constexpr { return Number<low_dim_hidden_ids_1_mod[i]>{}; },
Number<ndim_low_1>{});
},
Number<TensorAdaptor1::GetNumOfTransform()>{});
constexpr auto all_low_dim_hidden_idss =
container_concat(TensorAdaptor0::GetLowerDimensionHiddenIdss(), low_dim_hidden_idss_1);
// all_up_dim_hidden_idss =
// up_dim_hidden_idss_0 + shift_hidden_id_for_1(up_dim_hidden_idss_1)
constexpr auto up_dim_hidden_idss_1 = generate_tuple(
// generate sequence of ids for a transform
[&](auto itran) {
constexpr auto ndim_up_1 = TensorAdaptor1::GetUpperDimensionHiddenIdss()[itran].Size();
constexpr auto up_dim_hidden_ids_1 =
TensorAdaptor1::GetUpperDimensionHiddenIdss()[itran];
// sequence in, constexpr tuple out
constexpr auto up_dim_hidden_ids_1_mod = [&]() constexpr
{
auto up_dim_hidden_ids_1_mod_ = to_multi_index(up_dim_hidden_ids_1);
// shift hidden id
static_for<0, ndim_up_1, 1>{}([&](auto idim_up_1) {
up_dim_hidden_ids_1_mod_(idim_up_1) += adaptor1_hidden_id_shift;
});
return up_dim_hidden_ids_1_mod_;
}
();
// constexpr tuple to sequence
return generate_sequence_v2(
[&](auto i) constexpr { return Number<up_dim_hidden_ids_1_mod[i]>{}; },
Number<ndim_up_1>{});
},
Number<TensorAdaptor1::GetNumOfTransform()>{});
constexpr auto all_up_dim_hidden_idss =
container_concat(TensorAdaptor0::GetUpperDimensionHiddenIdss(), up_dim_hidden_idss_1);
// bottom_dim_hidden_ids = bottom_dim_hidden_ids_0
constexpr auto bottom_dim_hidden_ids = TensorAdaptor0::GetBottomDimensionHiddenIds();
// top_dim_hidden_ids = shift_hidden_id(top_dim_hidden_ids_1)
constexpr auto top_dim_hidden_ids =
TensorAdaptor1::GetTopDimensionHiddenIds() + Number<adaptor1_hidden_id_shift>{};
// put everything together
return TensorAdaptor<remove_cv_t<decltype(all_transforms)>,
remove_cv_t<decltype(all_low_dim_hidden_idss)>,
remove_cv_t<decltype(all_up_dim_hidden_idss)>,
remove_cv_t<decltype(bottom_dim_hidden_ids)>,
remove_cv_t<decltype(top_dim_hidden_ids)>>{all_transforms};
}
// Transforms: Tuple<transforms...>
// LowerDimensionOldTopIdss: Tuple<Sequence<...>, ...>
// UpperDimensionNewTopIdss: Tuple<Sequence<...>, ...>
template <typename Transforms, typename LowerDimensionOldTopIdss, typename UpperDimensionNewTopIdss>
__host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transforms& transforms,
LowerDimensionOldTopIdss,
UpperDimensionNewTopIdss)
{
constexpr index_t ntransform = Transforms::Size();
static_assert(LowerDimensionOldTopIdss::Size() == ntransform &&
UpperDimensionNewTopIdss::Size() == ntransform,
"wrong!");
// sanity check on LowerDimensionOldTopIdss and UpperDimensionNewTopIdss
constexpr auto all_low_dim_old_top_ids = unpack(
[](auto&&... xs) constexpr { return merge_sequences(xs...); }, LowerDimensionOldTopIdss{});
constexpr auto all_up_dim_new_top_ids = unpack(
[](auto&&... xs) constexpr { return merge_sequences(xs...); }, UpperDimensionNewTopIdss{});
static_assert(is_valid_sequence_map<decltype(all_low_dim_old_top_ids)>::value &&
is_valid_sequence_map<decltype(all_up_dim_new_top_ids)>::value,
"wrong!");
constexpr index_t ndim_old_top = all_low_dim_old_top_ids.Size();
constexpr index_t ndim_new_top = all_up_dim_new_top_ids.Size();
// low_dim_hidden_idss
constexpr auto low_dim_hidden_idss = LowerDimensionOldTopIdss{};
// up_dim_hidden_idss: shift UpperDimensionNewTopIdss by ndim_old_top (the number of bottom dimensions)
constexpr auto up_dim_hidden_idss = generate_tuple(
[](auto itran) { return UpperDimensionNewTopIdss{}[itran] + Number<ndim_old_top>{}; },
Number<ntransform>{});
// bottom_dim_hidden_ids
constexpr auto bottom_dim_hidden_ids =
typename arithmetic_sequence_gen<0, ndim_old_top, 1>::type{};
// top_dim_hidden_ids
constexpr auto top_dim_hidden_ids =
typename arithmetic_sequence_gen<0, ndim_new_top, 1>::type{} + Number<ndim_old_top>{};
return TensorAdaptor<remove_cv_t<Transforms>,
remove_cv_t<decltype(low_dim_hidden_idss)>,
remove_cv_t<decltype(up_dim_hidden_idss)>,
remove_cv_t<decltype(bottom_dim_hidden_ids)>,
remove_cv_t<decltype(top_dim_hidden_ids)>>{transforms};
}
template <typename X, typename... Xs, typename enable_if<sizeof...(Xs) >= 2, bool>::type = false>
__host__ __device__ constexpr auto chain_tensor_adaptors(const X& x, const Xs&... xs)
{
return chain_tensor_adaptors(x, chain_tensor_adaptors(xs...));
}
} // namespace ck
#endif
#ifndef CK_TENSOR_DESCRIPTOR_HPP
#define CK_TENSOR_DESCRIPTOR_HPP
#include "common_header.hpp"
#include "multi_index_transform.hpp"
namespace ck {
template <index_t NDimHidden, typename VisibleDimensionIds>
struct TensorCoordinate;
template <index_t NTransform, index_t NDimVisible, typename UpdateLowerIndexHack>
struct TensorCoordinateStep;
// Transforms: Tuple<transforms...>
// LowerDimensionIdss : Tuple<Sequence<...>, ...>
// UpperDimensionIdss : Tuple<Sequence<...>, ...>
// VisibleDimensionIds : Sequence<...>
template <typename Transforms,
typename LowerDimensionIdss,
typename UpperDimensionIdss,
typename VisibleDimensionIds,
typename ElementSpaceSize>
struct TensorDescriptor
{
// TODO make these private
__host__ __device__ static constexpr index_t GetNumOfTransform() { return Transforms::Size(); }
__host__ __device__ static constexpr index_t GetNumOfVisibleDimension()
{
return VisibleDimensionIds::Size();
}
__host__ __device__ static constexpr index_t GetNumOfHiddenDimension()
{
constexpr auto all_low_dim_ids = unpack(
[](auto&&... xs) constexpr { return merge_sequences(xs...); }, LowerDimensionIdss{});
constexpr auto all_up_dim_ids = unpack(
[](auto&&... xs) constexpr { return merge_sequences(xs...); }, UpperDimensionIdss{});
constexpr auto all_dim_ids = merge_sequences(all_low_dim_ids, all_up_dim_ids);
using unique_sort_all_dim_ids = typename sequence_unique_sort<decltype(all_dim_ids),
math::less<index_t>,
math::equal<index_t>>::type;
return unique_sort_all_dim_ids::Size();
}
__host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms)
{
const auto lengths = generate_tuple(
[&](auto idim_visible) {
constexpr auto tmp = GetTransformAndItsUpperDimension(idim_visible);
constexpr index_t itran = tmp[Number<0>{}];
constexpr index_t idim_up = tmp[Number<1>{}];
constexpr bool found = tmp[Number<2>{}];
static_assert(found == true,
"wrong! not found matching transformation and upper-dimension");
const auto length =
transforms[Number<itran>{}].GetUpperLengths()[Number<idim_up>{}];
return length;
},
Number<ndim_visible_>{});
// TODO: make container_reduce support tuple of Number and index_t
return container_reduce(lengths, math::multiplies{}, Number<1>{});
}
template <index_t IDim>
__host__ __device__ static constexpr auto GetTransformAndItsUpperDimension(Number<IDim>)
{
constexpr auto idim_visible = Number<IDim>{};
constexpr index_t idim_hidden = VisibleDimensionIds::At(idim_visible);
index_t itran_found = 0;
index_t idim_up_found = 0;
bool found = false;
static_for<0, ntransform_, 1>{}([&](auto itran) {
constexpr auto up_dim_ids = UpperDimensionIdss{}[itran];
static_for<0, up_dim_ids.Size(), 1>{}([&](auto idim_up) {
if constexpr(up_dim_ids[idim_up] == idim_hidden)
{
itran_found = itran;
idim_up_found = idim_up;
found = true;
}
});
});
return make_tuple(itran_found, idim_up_found, found);
}
constexpr static index_t ntransform_ = GetNumOfTransform();
constexpr static index_t ndim_visible_ = GetNumOfVisibleDimension();
constexpr static index_t ndim_hidden_ = GetNumOfHiddenDimension();
using VisibleIndex = MultiIndex<ndim_visible_>;
using HiddenIndex = MultiIndex<ndim_hidden_>;
using Coordinate = TensorCoordinate<ndim_hidden_, VisibleDimensionIds>;
// may be index_t or Number<>
using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;
public:
__host__ __device__ constexpr TensorDescriptor() = default;
__host__ __device__ constexpr TensorDescriptor(const Transforms& transforms,
ElementSpaceSize element_space_size)
: transforms_{transforms},
element_size_{InitializeElementSize(transforms)},
element_space_size_{element_space_size}
{
static_assert(Transforms::Size() == ntransform_ &&
LowerDimensionIdss::Size() == ntransform_ &&
UpperDimensionIdss::Size() == ntransform_,
"wrong! inconsistent # of transformations");
// TODO check dependency of dimensions is valid
}
__host__ __device__ static constexpr index_t GetNumOfDimension()
{
return GetNumOfVisibleDimension();
}
template <index_t IDim>
__host__ __device__ constexpr auto GetLength(Number<IDim>) const
{
static_assert(IDim >= 0 && IDim < ndim_visible_, "wrong! out of range");
constexpr auto tmp = GetTransformAndItsUpperDimension(Number<IDim>{});
constexpr index_t itran = tmp[Number<0>{}];
constexpr index_t idim_up = tmp[Number<1>{}];
constexpr bool found = tmp[Number<2>{}];
static_assert(found == true,
"wrong! not found matching transformation and upper-dimension");
return transforms_[Number<itran>{}].GetUpperLengths()[Number<idim_up>{}];
}
__host__ __device__ constexpr auto GetElementSize() const { return element_size_; }
__host__ __device__ constexpr auto GetElementSpaceSize() const { return element_space_size_; }
template <typename Idx>
__host__ __device__ constexpr index_t CalculateOffset(const Idx& idx) const
{
static_assert(Idx::Size() == GetNumOfDimension(), "wrong! inconsistent # of dimension");
return make_tensor_coordinate(*this, idx).GetOffset();
}
// TODO make these private
__host__ __device__ constexpr const auto& GetTransforms() const { return transforms_; }
__host__ __device__ static constexpr auto GetLowerDimensionIdss()
{
return LowerDimensionIdss{};
}
__host__ __device__ static constexpr auto GetUpperDimensionIdss()
{
return UpperDimensionIdss{};
}
__host__ __device__ static constexpr auto GetVisibleDimensionIds()
{
return VisibleDimensionIds{};
}
__host__ __device__ static constexpr bool IsKnownAtCompileTime()
{
bool is_known = true;
static_for<0, Transforms::Size(), 1>{}([&](auto i) {
is_known &=
remove_cv_t<remove_reference_t<decltype(Transforms{}[i])>>::IsKnownAtCompileTime();
});
return is_known && is_known_at_compile_time<ElementSize>::value &&
is_known_at_compile_time<ElementSpaceSize>::value;
}
__host__ __device__ void Print() const
{
printf("{");
printf("TensorDescriptor, ");
static_for<0, ntransform_, 1>{}([&](auto i) {
printf("transforms: ");
transforms_[i].Print();
printf("LowerDimensionIds:");
LowerDimensionIdss{}.At(i).Print();
printf("UpperDimensionIds:");
UpperDimensionIdss{}.At(i).Print();
});
printf("}");
VisibleDimensionIds::Print();
}
// TODO make these private
Transforms transforms_;
ElementSize element_size_;
ElementSpaceSize element_space_size_;
};
template <index_t NDimHidden, typename VisibleDimensionIds>
struct TensorCoordinate
{
// TODO make these private
static constexpr index_t ndim_visible_ = VisibleDimensionIds::Size();
using HiddenIndex = MultiIndex<NDimHidden>;
using VisibleIndex = MultiIndex<ndim_visible_>;
public:
__host__ __device__ constexpr TensorCoordinate() = default;
__host__ __device__ constexpr TensorCoordinate(const HiddenIndex& idx_hidden)
: idx_hidden_{idx_hidden}
{
}
__host__ __device__ constexpr auto GetIndex() const { return GetVisibleIndex(); }
__host__ __device__ constexpr index_t GetOffset() const { return idx_hidden_[Number<0>{}]; }
// TODO make these private
__host__ __device__ constexpr const auto& GetHiddenIndex() const { return idx_hidden_; }
__host__ __device__ auto& GetHiddenIndex() { return idx_hidden_; }
__host__ __device__ constexpr auto GetVisibleIndex() const
{
return get_container_subset(idx_hidden_, VisibleDimensionIds{});
}
// TODO make these private
HiddenIndex idx_hidden_;
};
template <index_t NTransform, index_t NDimVisible, typename UpdateLowerIndexHack>
struct TensorCoordinateStep
{
// TODO make these private
using VisibleIndex = MultiIndex<NDimVisible>;
public:
__host__ __device__ constexpr TensorCoordinateStep() = default;
__host__ __device__ constexpr TensorCoordinateStep(const VisibleIndex& idx_diff_visible,
const MultiIndex<NTransform>& do_transforms)
: idx_diff_visible_{idx_diff_visible}, do_transforms_{do_transforms}
{
}
__host__ __device__ constexpr const auto& GetIndexDiff() const { return GetVisibleIndexDiff(); }
// TODO make these private
__host__ __device__ constexpr const auto& GetVisibleIndexDiff() const
{
return idx_diff_visible_;
}
VisibleIndex idx_diff_visible_;
MultiIndex<NTransform> do_transforms_;
// HACK: control UpdateLowerIndex()
static constexpr UpdateLowerIndexHack update_lower_index_hack_;
};
// TODO: How to fix this? It uses a struct instead of a lambda because a lambda
// doesn't have a constructor, and it is defined outside the scope where it is used
// (transform_tensor_descriptor) because a template cannot be defined inside a function
// template
template <typename NewTransforms>
struct lambda_get_up_dim_num
{
template <typename I>
__host__ __device__ constexpr auto operator()(I) const
{
using Tran = remove_reference_t<decltype(NewTransforms{}.At(I{}))>;
return Number<Tran::GetNumOfUpperDimension()>{};
}
};
template <typename OldTensorDescriptor,
typename NewTransforms,
typename NewLowerDimensionOldVisibleIdss,
typename NewUpperDimensionNewVisibleIdss>
__host__ __device__ constexpr auto
transform_tensor_descriptor(const OldTensorDescriptor& old_tensor_desc,
const NewTransforms& new_transforms,
NewLowerDimensionOldVisibleIdss,
NewUpperDimensionNewVisibleIdss)
{
// sanity check
{
constexpr auto all_old_top_ids = unpack([](auto... xs) { return merge_sequences(xs...); },
NewLowerDimensionOldVisibleIdss{});
constexpr auto all_new_top_ids = unpack([](auto... xs) { return merge_sequences(xs...); },
NewUpperDimensionNewVisibleIdss{});
static_assert(is_valid_sequence_map<decltype(all_old_top_ids)>::value &&
is_valid_sequence_map<decltype(all_new_top_ids)>::value,
"wrong!");
}
// lower dimension's hidden idss
// convert lower dimension visible idss (tuple of sequences) to hidden idss (tuple of
// sequences)
constexpr auto low_dim_hidden_idss = transform_tuples(
// convert lower dimension visible ids (a sequence) to hidden ids (a sequence)
[](auto low_dim_visible_ids) constexpr {
return transform_sequences(
// convert lower dimension visible id to hidden id
[](auto low_dim_visible_id) constexpr {
return OldTensorDescriptor::GetVisibleDimensionIds()[low_dim_visible_id];
},
low_dim_visible_ids);
},
NewLowerDimensionOldVisibleIdss{});
constexpr index_t num_new_transform = NewTransforms::Size();
// upper dimension's hidden idss
constexpr index_t old_hidden_dim_number = OldTensorDescriptor::GetNumOfHiddenDimension();
constexpr auto up_dim_numbers =
generate_sequence(lambda_get_up_dim_num<NewTransforms>{}, Number<num_new_transform>{});
constexpr auto up_dim_numbers_scan = merge_sequences(
Sequence<0>{}, inclusive_scan_sequence(up_dim_numbers, math::plus<index_t>{}, Number<0>{}));
constexpr auto up_dim_hidden_idss = generate_tuple(
[ old_hidden_dim_number, up_dim_numbers_scan ](auto i) constexpr {
return
typename arithmetic_sequence_gen<old_hidden_dim_number + up_dim_numbers_scan[i],
old_hidden_dim_number + up_dim_numbers_scan[i + 1],
1>::type{};
},
Number<num_new_transform>{});
// new visible dimension's hidden ids
constexpr auto unordered_new_visible_dim_hidden_ids = unpack(
[](auto... xs) constexpr { return merge_sequences(xs...); }, up_dim_hidden_idss);
constexpr auto new_visible_dim_unordered2ordered = unpack(
[](auto... xs) constexpr { return merge_sequences(xs...); },
NewUpperDimensionNewVisibleIdss{});
constexpr auto new_visible_dim_hidden_ids =
unordered_new_visible_dim_hidden_ids.ReorderGivenOld2New(new_visible_dim_unordered2ordered);
// put everything together
const auto all_transforms = container_concat(old_tensor_desc.GetTransforms(), new_transforms);
constexpr auto all_low_dim_hidden_idss =
container_concat(OldTensorDescriptor::GetLowerDimensionIdss(), low_dim_hidden_idss);
constexpr auto all_up_dim_hidden_idss =
container_concat(OldTensorDescriptor::GetUpperDimensionIdss(), up_dim_hidden_idss);
const auto element_space_size = old_tensor_desc.GetElementSpaceSize();
return TensorDescriptor<remove_cv_t<decltype(all_transforms)>,
remove_cv_t<decltype(all_low_dim_hidden_idss)>,
remove_cv_t<decltype(all_up_dim_hidden_idss)>,
remove_cv_t<decltype(new_visible_dim_hidden_ids)>,
remove_cv_t<decltype(element_space_size)>>{all_transforms,
element_space_size};
}
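// Usage sketch (illustrative; make_naive_tensor_descriptor_packed comes from
// tensor_descriptor_helper.hpp): pad the W dimension of an (N, W) = (4, 30) descriptor by 1 on
// each side, giving a (4, 32) view; indices that land in the padding are reported as invalid
// by coordinate_has_valid_offset():
//
//   const auto desc_n_w  = make_naive_tensor_descriptor_packed(make_tuple(Number<4>{}, Number<30>{}));
//   const auto desc_n_wp = transform_tensor_descriptor(
//       desc_n_w,
//       make_tuple(make_pass_through_transform(Number<4>{}),
//                  make_pad_transform(Number<30>{}, Number<1>{}, Number<1>{})),
//       make_tuple(Sequence<0>{}, Sequence<1>{}),
//       make_tuple(Sequence<0>{}, Sequence<1>{}));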
template <typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc& tensor_desc,
const VisibleIndex& idx_visible)
{
static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(),
"wrong! # of dimension inconsistent");
constexpr index_t ntransform = TensorDesc::GetNumOfTransform();
constexpr index_t ndim_hidden = TensorDesc::GetNumOfHiddenDimension();
constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds();
MultiIndex<ndim_hidden> idx_hidden;
// initialize visible index
set_container_subset(idx_hidden, visible_dim_ids, idx_visible);
// calculate hidden index
static_for<ntransform, 0, -1>{}([&tensor_desc, &idx_hidden](auto itran_p1) {
auto itran = itran_p1 - Number<1>{};
const auto& tran = tensor_desc.GetTransforms().At(itran);
constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran);
const auto idx_up = get_container_subset(idx_hidden, dims_up);
MultiIndex<dims_low.Size()> idx_low;
tran.CalculateLowerIndex(idx_low, idx_up);
set_container_subset(idx_hidden, dims_low, idx_low);
});
return TensorCoordinate<ndim_hidden, decltype(visible_dim_ids)>{idx_hidden};
}
// UpdateLowerIndexHack: Sequence<...>
// HACK: control UpdateLowerIndex
template <typename TensorDesc, typename VisibleIndex, typename UpdateLowerIndexHack>
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc&,
const VisibleIndex& idx_diff_visible,
UpdateLowerIndexHack)
{
static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(),
"wrong! # of dimension inconsistent");
constexpr index_t ntransform = TensorDesc::GetNumOfTransform();
constexpr index_t ndim_hidden = TensorDesc::GetNumOfHiddenDimension();
constexpr index_t ndim_visible = TensorDesc::GetNumOfVisibleDimension();
constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds();
static_assert(UpdateLowerIndexHack::Size() == ntransform, "wrong!");
// use index_t for boolean type
auto do_transforms = make_zero_multi_index<ntransform>();
auto is_non_zero_diff = make_zero_multi_index<ndim_hidden>();
// decide do_transforms by checking for non-zero index diff components
MultiIndex<VisibleIndex::Size()> non_zero_diff_pick_visible;
static_for<0, ndim_visible, 1>{}(
[&](auto i) { non_zero_diff_pick_visible(i) = (idx_diff_visible[i] != 0); });
set_container_subset(is_non_zero_diff, visible_dim_ids, non_zero_diff_pick_visible);
static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran);
const auto non_zero_diff_pick_up = get_container_subset(is_non_zero_diff, dims_up);
MultiIndex<dims_low.Size()> non_zero_diff_pick_low;
// if any of the upper index diff components is non-zero, then
// 1) this transform needs to be done, and
// 2) all components of the lower index diff are assumed to be non-zero and need to be
// computed
const bool idx_diff_up_has_non_zero = container_reduce(
non_zero_diff_pick_up, [](auto a, auto b) constexpr { return a or b; }, false);
do_transforms(itran) = idx_diff_up_has_non_zero;
static_for<0, dims_low.Size(), 1>{}(
[&](auto i) { non_zero_diff_pick_low(i) = idx_diff_up_has_non_zero; });
set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low);
});
return TensorCoordinateStep<ntransform, ndim_visible, UpdateLowerIndexHack>{idx_diff_visible,
do_transforms};
}
template <typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc&,
const VisibleIndex& idx_diff_visible)
{
constexpr index_t ntransform = TensorDesc::GetNumOfTransform();
return make_tensor_coordinate_step(
TensorDesc{}, idx_diff_visible, typename uniform_sequence_gen<ntransform, 0>::type{});
}
template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc& tensor_desc,
TensorCoord& coord,
const TensorCoordStep& coord_step)
{
constexpr index_t ndim_hidden = TensorDesc::GetNumOfHiddenDimension();
constexpr index_t ntransform = TensorDesc::GetNumOfTransform();
// this is what needs to be calculated
auto idx_diff_hidden = make_zero_multi_index<ndim_hidden>();
// initialize visible index diff
set_container_subset(
idx_diff_hidden, TensorDesc::GetVisibleDimensionIds(), coord_step.GetVisibleIndexDiff());
// this is what needs to be updated
auto& idx_hidden = coord.GetHiddenIndex();
// update visible index
auto idx_hidden_pick_visible =
get_container_subset(idx_hidden, TensorDesc::GetVisibleDimensionIds());
idx_hidden_pick_visible += coord_step.GetIndexDiff();
set_container_subset(idx_hidden, TensorDesc::GetVisibleDimensionIds(), idx_hidden_pick_visible);
// update rest of hidden index
static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
if(coord_step.do_transforms_[itran])
{
const auto& tran = tensor_desc.GetTransforms().At(itran);
constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran);
const auto idx_up_new = get_container_subset(idx_hidden, dims_up);
auto idx_low = get_container_subset(idx_hidden, dims_low);
const auto idx_diff_up = get_container_subset(idx_diff_hidden, dims_up);
MultiIndex<dims_low.Size()> idx_diff_low;
// HACK: control UpdateLowerIndex for Merge using hack
constexpr index_t Hack = decltype(coord_step.update_lower_index_hack_)::At(itran);
tran.UpdateLowerIndex(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
set_container_subset(idx_diff_hidden, dims_low, idx_diff_low);
set_container_subset(idx_hidden, dims_low, idx_low);
}
});
}
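// Usage sketch (illustrative; desc stands for any TensorDescriptor built by the helpers in
// tensor_descriptor_helper.hpp): a coordinate caches the full hidden index, and moving it by a
// pre-computed step only re-runs the transforms whose upper index diff is non-zero:
//
//   auto coord      = make_tensor_coordinate(desc, make_multi_index(0, 0));
//   const auto step = make_tensor_coordinate_step(desc, make_multi_index(0, 1));
//   move_tensor_coordinate(desc, coord, step); // coord now refers to visible index (0, 1)
//   const index_t offset = coord.GetOffset();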
template <typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool
coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc& tensor_desc,
const TensorCoord& coord)
{
bool valid = true;
constexpr index_t ntransform = TensorDesc::GetNumOfTransform();
const auto& idx_hidden = coord.GetHiddenIndex();
static_for<ntransform - 1, -1, -1>{}([&tensor_desc, &idx_hidden, &valid](auto itran) {
const auto tran = tensor_desc.GetTransforms().At(itran);
// check validity only if the current transformation does not always have a valid mapping
if constexpr(!decltype(tran)::IsValidUpperIndexAlwaysMappedToValidLowerIndex())
{
const auto idx_up =
get_container_subset(idx_hidden, TensorDesc::GetUpperDimensionIdss().At(itran));
// Comment: using valid = valid && .. will result in weird control flow in ISA
valid &= tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up);
}
});
return valid;
}
template <typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc& tensor_desc,
const TensorCoord& coord)
{
// check visible index
const auto& idx_visible = coord.GetVisibleIndex();
bool is_visible_index_valid = true;
static_for<0, TensorDesc::GetNumOfDimension(), 1>{}(
[&is_visible_index_valid, &idx_visible, &tensor_desc](auto i) {
is_visible_index_valid =
is_visible_index_valid &&
(idx_visible[i] >= 0 && idx_visible[i] < tensor_desc.GetLength(i));
});
// check other hidden index
return is_visible_index_valid &&
coordinate_has_valid_offset_assuming_visible_index_is_valid(tensor_desc, coord);
}
template <typename TensorDesc>
using TensorCoordinate_t = decltype(make_tensor_coordinate(
TensorDesc{}, MultiIndex<remove_cv_t<remove_reference_t<TensorDesc>>::GetNumOfDimension()>{}));
template <typename TensorDesc>
using TensorCoordinateStep_t = decltype(make_tensor_coordinate_step(
TensorDesc{}, MultiIndex<remove_cv_t<remove_reference_t<TensorDesc>>::GetNumOfDimension()>{}));
} // namespace ck
#endif
#ifndef CK_TENSOR_DESCRIPTOR_HELPER_HPP
#define CK_TENSOR_DESCRIPTOR_HELPER_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "multi_index_transform_helper.hpp"
namespace ck {
/*
* These functions create tensor descriptors at runtime. If they are not constexpr, you will
* likely see usage of scratch memory during construction of these tensor descriptors. So
* it's better to call these functions on the host and then pass the constructed tensor
* descriptors to the GPU. If the tensor descriptors being constructed are constexpr, then
* you can call these functions on the GPU without worrying about scratch memory usage.
*/
#if CK_WORKAROUND_SWDEV_275126
template <typename Lengths, typename Strides, index_t I, typename AccOld>
__host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengths& lengths,
const Strides& strides,
Number<I> i,
AccOld acc_old)
{
auto acc_new = acc_old + (lengths[i] - Number<1>{}) * strides[i];
if constexpr(i.value < Lengths::Size() - 1)
{
return calculate_element_space_size_impl(lengths, strides, i + Number<1>{}, acc_new);
}
else
{
return acc_new;
}
}
#endif
template <typename... Lengths,
typename... Strides,
typename enable_if<sizeof...(Lengths) == sizeof...(Strides), bool>::type = false>
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Lengths...>& lengths,
const Tuple<Strides...>& strides)
{
constexpr index_t N = sizeof...(Lengths);
const auto transforms = make_tuple(make_embed_transform(lengths, strides));
constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{});
constexpr auto up_dim_hidden_idss =
make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{});
constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
#if !CK_WORKAROUND_SWDEV_275126
// the rocm-4.1 compiler would crash on a recursive lambda
// recursive function for reduction
auto f = [&](auto fs, auto i, auto acc_old) {
auto acc_new = acc_old + (lengths[i] - Number<1>{}) * strides[i];
if constexpr(i.value < N - 1)
{
return fs(fs, i + Number<1>{}, acc_new);
}
else
{
return acc_new;
}
};
const auto element_space_size = f(f, Number<0>{}, Number<1>{});
#else
const auto element_space_size =
calculate_element_space_size_impl(lengths, strides, Number<0>{}, Number<1>{});
#endif
return TensorDescriptor<remove_cv_t<decltype(transforms)>,
remove_cv_t<decltype(low_dim_hidden_idss)>,
remove_cv_t<decltype(up_dim_hidden_idss)>,
remove_cv_t<decltype(visible_dim_hidden_ids)>,
remove_cv_t<decltype(element_space_size)>>{transforms,
element_space_size};
}
// Lengths... can be:
// 1) index_t, which is known at run-time
// 2) Number<>, which is known at compile-time
template <typename... Lengths>
__host__ __device__ constexpr auto
make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
{
constexpr index_t N = sizeof...(Lengths);
const auto transforms = make_tuple(make_unmerge_transform(lengths));
constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{});
constexpr auto up_dim_hidden_idss =
make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{});
constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
const auto element_space_size = container_reduce(lengths, math::multiplies{}, Number<1>{});
return TensorDescriptor<remove_cv_t<decltype(transforms)>,
remove_cv_t<decltype(low_dim_hidden_idss)>,
remove_cv_t<decltype(up_dim_hidden_idss)>,
remove_cv_t<decltype(visible_dim_hidden_ids)>,
remove_cv_t<decltype(element_space_size)>>{transforms,
element_space_size};
}
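// Usage sketch (illustrative): with compile-time Number<> lengths the packed descriptor is
// fully constexpr, so it can be built in device code without scratch memory. For lengths
// (4, 8, 16) the implied packed strides are (128, 16, 1):
//
//   constexpr auto desc = make_naive_tensor_descriptor_packed(
//       make_tuple(Number<4>{}, Number<8>{}, Number<16>{}));
//   // offset of index (1, 2, 3) is 1 * 128 + 2 * 16 + 3 = 163
//   const index_t offset = desc.CalculateOffset(make_multi_index(1, 2, 3));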
template <typename... Lengths, typename Align>
__host__ __device__ constexpr auto
make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align align)
{
constexpr auto I1 = Number<1>{};
constexpr index_t N = sizeof...(Lengths);
const auto stride_n_minus_2 = math::integer_least_multiple(lengths[Number<N - 1>{}], align);
auto strides = generate_tuple(
[&](auto i) {
if constexpr(i.value == N - 1)
{
return I1;
}
else if constexpr(i.value == N - 2)
{
return Number<stride_n_minus_2>{};
}
else
{
return container_reduce(lengths,
math::multiplies{},
Number<stride_n_minus_2>{},
i + I1,
Number<N - 1>{},
I1);
}
},
Number<N>{});
return make_naive_tensor_descriptor(lengths, strides);
}
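// Usage sketch (illustrative, assuming math::integer_least_multiple rounds up to the nearest
// multiple): with align = 64, an (8, 100) descriptor keeps a unit stride in the last dimension
// and rounds the row stride up to 128, so every row starts on a 64-element boundary:
//
//   const auto desc = make_naive_tensor_descriptor_aligned(
//       make_tuple(Number<8>{}, Number<100>{}), Number<64>{});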
} // namespace ck
#endif
#ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
#define CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
#include "common_header.hpp"
#include "tensor_adaptor.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "threadwise_contraction_dlops.hpp"
namespace ck {
// C[M0, M1, N0, N1] += transpose(A[K, M0, M1]) * B[K, N0, N1]
// A and B are visible to the whole block; C is distributed among the threads
// Assume:
// 1. A:
// 1. AKMBlockDesc is known at compile-time
// 2. ABlockBuffer is DynamicBuffer
// 2. B:
// 1. BKNBlockDesc is known at compile-time
// 2. BBlockBuffer is DynamicBuffer
// 3. C:
// 1. CM0M1N0N1ThreadDesc is known at compile-time
// 2. CThreadBuffer is StaticBuffer
// Also assume:
// M0 = N0 = 2. It will do 2x2 pipelined read and fma (ABBA optimization)
template <
index_t BlockSize,
typename FloatA,
typename FloatB,
typename FloatC,
typename AKMBlockDesc,
typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11,
typename enable_if<AKMBlockDesc::IsKnownAtCompileTime() && BKNBlockDesc::IsKnownAtCompileTime(),
bool>::type = false>
struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
{
using AIndex = MultiIndex<3>;
using BIndex = MultiIndex<3>;
using CIndex = MultiIndex<4>;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr index_t K = AKMBlockDesc{}.GetLength(I0);
static constexpr index_t M = AKMBlockDesc{}.GetLength(I1);
static constexpr index_t N = BKNBlockDesc{}.GetLength(I1);
static constexpr index_t M100 = M1N1ThreadClusterM100;
static constexpr index_t N100 = M1N1ThreadClusterN100;
static constexpr index_t M101 = M1N1ThreadClusterM101;
static constexpr index_t N101 = M1N1ThreadClusterN101;
static constexpr index_t M11 = M1PerThreadM11;
static constexpr index_t N11 = N1PerThreadN11;
static constexpr index_t M1 = M1N1ThreadClusterM100 * M1N1ThreadClusterM101 * M1PerThreadM11;
static constexpr index_t N1 = M1N1ThreadClusterN100 * M1N1ThreadClusterN101 * N1PerThreadN11;
static constexpr index_t M0 = M / M1;
static constexpr index_t N0 = N / N1;
__host__ __device__ static constexpr auto
MakeAKM0M1BlockDescriptor(const AKMBlockDesc& /* a_k_m_block_desc */)
{
const auto a_k_m0_m1_block_desc = transform_tensor_descriptor(
AKMBlockDesc{},
make_tuple(make_pass_through_transform(Number<K>{}),
make_unmerge_transform(make_tuple(Number<M0>{}, Number<M1>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}));
return a_k_m0_m1_block_desc;
}
__host__ __device__ static constexpr auto
MakeBKN0N1BlockDescriptor(const BKNBlockDesc& /* b_k_n_block_desc */)
{
const auto b_k_n0_n1_block_desc = transform_tensor_descriptor(
BKNBlockDesc{},
make_tuple(make_pass_through_transform(Number<K>{}),
make_unmerge_transform(make_tuple(Number<N0>{}, Number<N1>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}));
return b_k_n0_n1_block_desc;
}
__host__ __device__ static constexpr auto MakeCM0M100M101M11N0N100N101N11ToMNBlockAdaptor()
{
// upper: [M0, M100, M101, M11, N0, N100, N101, N11]
// lower: [M, N]
constexpr auto c_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n_block_adaptor =
make_single_stage_tensor_adaptor(
make_tuple(make_unmerge_transform(make_tuple(
Number<M0>{}, Number<M100>{}, Number<M101>{}, Number<M11>{})),
make_unmerge_transform(make_tuple(
Number<N0>{}, Number<N100>{}, Number<N101>{}, Number<N11>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4, 5, 6, 7>{}));
return c_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n_block_adaptor;
}
__host__ __device__ static constexpr auto
MakeCM0M100M101M11N0N100N101N11ToM0M1N0N1BlockAdaptor()
{
// upper: [M0, M100, M101, M11, N0, N100, N101, N11]
// lower: [M0, M1, N0, N1]
constexpr auto c_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1_block_adaptor =
make_single_stage_tensor_adaptor(
make_tuple(make_pass_through_transform(Number<M0>{}),
make_unmerge_transform(
make_tuple(Number<M100>{}, Number<M101>{}, Number<M11>{})),
make_pass_through_transform(Number<N0>{}),
make_unmerge_transform(
make_tuple(Number<N100>{}, Number<N101>{}, Number<N11>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}, Sequence<5, 6, 7>{}));
return c_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1_block_adaptor;
}
__host__ __device__ static constexpr auto GetCM0M1N0N1ThreadTensorLengths()
{
return Sequence<M0, M11, N0, N11>{};
}
static constexpr auto a_k_m0_m1_block_desc_ = MakeAKM0M1BlockDescriptor(AKMBlockDesc{});
static constexpr auto b_k_n0_n1_block_desc_ = MakeBKN0N1BlockDescriptor(BKNBlockDesc{});
public:
__device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2()
: c_thread_origin_data_idx_{CalculateCM0M1N0N1ThreadOriginOnBlock(
get_thread_local_1d_id())},
a_thread_copy_{
make_tuple(0, c_thread_origin_data_idx_[I0], c_thread_origin_data_idx_[I1])},
b_thread_copy_{
make_tuple(0, c_thread_origin_data_idx_[I2], c_thread_origin_data_idx_[I3])}
{
static_assert(AKMBlockDesc::IsKnownAtCompileTime() && BKNBlockDesc::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
static_assert(BlockSize == M101 * M100 * N101 * N100,
"wrong! blocksize and cluster size not consistent");
static_assert(M % M1 == 0 && N % N1 == 0, "wrong!");
static_assert(AKMBlockDesc{}.GetLength(I0) == BKNBlockDesc{}.GetLength(I0),
"wrong! K dimension not consistent");
// TODO: remove this restriction
static_assert(M0 == 2 && N0 == 2, "wrong");
}
__device__ static CIndex CalculateCM0M1N0N1ThreadOriginOnBlock(index_t thread_id)
{
// lower: [M0, M1, N0, N1]
// upper: [M0, M100, M101, M11, N0, N100, N101, N11]
constexpr auto adaptor0 = MakeCM0M100M101M11N0N100N101N11ToM0M1N0N1BlockAdaptor();
// lower: [M0, M100, M101, M11, N0, N100, N101, N11]
// upper: [Tid, M0, M11, N0, N11]
constexpr auto adaptor1 = make_single_stage_tensor_adaptor(
make_tuple(make_merge_transform(make_tuple(M100, N100, M101, N101)),
make_pass_through_transform(M0),
make_pass_through_transform(M11),
make_pass_through_transform(N0),
make_pass_through_transform(N11)),
make_tuple(
Sequence<1, 5, 2, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<4>{}, Sequence<7>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
constexpr auto adaptor = chain_tensor_adaptors(adaptor0, adaptor1);
return adaptor.CalculateBottomIndex(make_multi_index(thread_id, 0, 0, 0, 0));
}
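// Worked example (illustrative, assuming cluster sizes M100 = N100 = M101 = N101 = 2 and
// M11 = N11 = 4): thread_id = 5 decomposes over (M100, N100, M101, N101) as (0, 1, 0, 1),
// so this thread's C origin on the block is
// (m0, m1, n0, n1) = (0, (0 * M101 + 0) * M11, 0, (1 * N101 + 1) * N11) = (0, 0, 0, 12).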
__host__ __device__ static constexpr index_t GetABlockAlignment() { return M1PerThreadM11; }
__host__ __device__ static constexpr auto GetBBlockAlignment() { return N1PerThreadN11; }
template <typename CM0M1N0N1ThreadDesc,
typename ABlockBuffer,
typename BBlockBuffer,
typename CThreadBuffer>
__device__ void Run(const CM0M1N0N1ThreadDesc& /* c_m0_m1_n0_n1_thread_desc */,
const ABlockBuffer& a_block_buf,
const BBlockBuffer& b_block_buf,
CThreadBuffer& c_thread_buf) const
{
static_assert(CM0M1N0N1ThreadDesc::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
// TODO: remove this restriction
static_assert(M0 == 2 && N0 == 2 && CM0M1N0N1ThreadDesc{}.GetLength(I0) == M0 &&
CM0M1N0N1ThreadDesc{}.GetLength(I2) == N0,
"wrong");
auto a_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatA>(
a_k_m0_m1_thread_desc_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatB>(
b_k_n0_n1_thread_desc_.GetElementSpaceSize());
constexpr auto threadwise_gemm =
ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1<FloatA,
FloatB,
FloatC,
decltype(a_k_m0_m1_thread_desc_),
decltype(b_k_n0_n1_thread_desc_),
CM0M1N0N1ThreadDesc,
Sequence<KPerThread>,
Sequence<1, M1PerThreadM11>,
Sequence<1, N1PerThreadN11>>{};
// read A_sub_0
a_thread_copy_.Run(a_k_m0_m1_block_desc_,
make_tuple(I0, I0, I0),
a_block_buf,
a_k_m0_m1_thread_desc_,
make_tuple(I0, I0, I0),
a_thread_buf);
// read B_sub_0
b_thread_copy_.Run(b_k_n0_n1_block_desc_,
make_tuple(I0, I0, I0),
b_block_buf,
b_k_n0_n1_thread_desc_,
make_tuple(I0, I0, I0),
b_thread_buf);
// read B_sub_1
b_thread_copy_.Run(b_k_n0_n1_block_desc_,
make_tuple(I0, I1, I0),
b_block_buf,
b_k_n0_n1_thread_desc_,
make_tuple(I0, I1, I0),
b_thread_buf);
// read A_sub_1
a_thread_copy_.Run(a_k_m0_m1_block_desc_,
make_tuple(I0, I1, I0),
a_block_buf,
a_k_m0_m1_thread_desc_,
make_tuple(I0, I1, I0),
a_thread_buf);
// C_sub_00 += transpose(A_sub_0) * B_sub_0
threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I0, I0),
b_thread_buf,
make_tuple(I0, I0, I0),
c_thread_buf,
make_tuple(I0, I0, I0, I0));
// C_sub_01 += transpose(A_sub_0) * B_sub_1
threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I0, I0),
b_thread_buf,
make_tuple(I0, I1, I0),
c_thread_buf,
make_tuple(I0, I0, I1, I0));
// loop over rest of k
static_for<KPerThread, K, KPerThread>{}([&](auto k) {
// read A_sub_0
a_thread_copy_.Run(a_k_m0_m1_block_desc_,
make_tuple(k, I0, I0),
a_block_buf,
a_k_m0_m1_thread_desc_,
make_tuple(I0, I0, I0),
a_thread_buf);
// C_sub_10 += transpose(A_sub_1) * B_sub_0
threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I1, I0),
b_thread_buf,
make_tuple(I0, I0, I0),
c_thread_buf,
make_tuple(I1, I0, I0, I0));
// read B_sub_0
b_thread_copy_.Run(b_k_n0_n1_block_desc_,
make_tuple(k, I0, I0),
b_block_buf,
b_k_n0_n1_thread_desc_,
make_tuple(I0, I0, I0),
b_thread_buf);
// C_sub_11 += transpose(A_sub_1) * B_sub_1
threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I1, I0),
b_thread_buf,
make_tuple(I0, I1, I0),
c_thread_buf,
make_tuple(I1, I0, I1, I0));
// read B_sub_1
b_thread_copy_.Run(b_k_n0_n1_block_desc_,
make_tuple(k, I1, I0),
b_block_buf,
b_k_n0_n1_thread_desc_,
make_tuple(I0, I1, I0),
b_thread_buf);
// read A_sub_1
a_thread_copy_.Run(a_k_m0_m1_block_desc_,
make_tuple(k, I1, I0),
a_block_buf,
a_k_m0_m1_thread_desc_,
make_tuple(I0, I1, I0),
a_thread_buf);
// C_sub_00 += transpose(A_sub_0) * B_sub_0
threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I0, I0),
b_thread_buf,
make_tuple(I0, I0, I0),
c_thread_buf,
make_tuple(I0, I0, I0, I0));
// C_sub_01 += transpose(A_sub_0) * B_sub_1
threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I0, I0),
b_thread_buf,
make_tuple(I0, I1, I0),
c_thread_buf,
make_tuple(I0, I0, I1, I0));
});
// C_sub_10 += transpose(A_sub_1) * B_sub_0
threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I1, I0),
b_thread_buf,
make_tuple(I0, I0, I0),
c_thread_buf,
make_tuple(I1, I0, I0, I0));
// C_sub_11 += transpose(A_sub_1) * B_sub_1
threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I1, I0),
b_thread_buf,
make_tuple(I0, I1, I0),
c_thread_buf,
make_tuple(I1, I0, I1, I0));
}
private:
// A[K, M0, M1]
static constexpr auto a_k_m0_m1_thread_desc_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<KPerThread>{}, Number<M0>{}, Number<M1PerThreadM11>{}));
// B[K, N0, N1]
static constexpr auto b_k_n0_n1_thread_desc_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<KPerThread>{}, Number<N0>{}, Number<N1PerThreadN11>{}));
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
FloatA,
decltype(a_k_m0_m1_block_desc_),
decltype(a_k_m0_m1_thread_desc_),
Sequence<KPerThread, 1, M1PerThreadM11>,
Sequence<0, 1, 2>,
2,
AThreadCopyScalarPerVector_M11,
1>;
using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
FloatB,
decltype(b_k_n0_n1_block_desc_),
decltype(b_k_n0_n1_thread_desc_),
Sequence<KPerThread, 1, N1PerThreadN11>,
Sequence<0, 1, 2>,
2,
BThreadCopyScalarPerVector_N11,
1>;
CIndex c_thread_origin_data_idx_;
AThreadCopy a_thread_copy_;
BThreadCopy b_thread_copy_;
};
} // namespace ck
#endif
#ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP
#define CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP
#include "common_header.hpp"
#include "tensor_adaptor.hpp"
#include "threadwise_tensor_slice_transfer_v2.hpp"
#include "threadwise_contraction_dlops.hpp"
namespace ck {
// C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1]
// A and B are visible to the whole block; C is distributed among the threads
// Assume:
// 1. A:
// 1. ABlockDesc_BK0_BM_BK1 is known at compile-time
// 2. ABlockBuffer is DynamicBuffer
// 2. B:
// 1. BBlockDesc_BK0_BN_BK1 is known at compile-time
// 2. BBlockBuffer is DynamicBuffer
// 3. C:
// 1. CThreadDesc_BM0_BM11_BN0_BN11 is known at compile-time
// 2. CThreadBuffer is StaticBuffer
// Also assume:
// BM10BN10ThreadClusterBM10Xs::Size() == BM10BN10ThreadClusterBN10Xs::Size() == 2
// BM0 = BN0 = 2. It will do 2x2 pipelined read and fma (ABBA optimization)
template <index_t BlockSize,
typename FloatA,
typename FloatB,
typename FloatC,
typename ABlockDesc_BK0_BM_BK1,
typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread,
typename BM10BN10ThreadClusterBM10Xs, // Sequence<BM10BN10ThreadClusterBM100,
// BM10BN10ThreadClusterBM101, ...>
typename BM10BN10ThreadClusterBN10Xs, // Sequence<BM10BN10ThreadClusterBN100,
// BM10BN10ThreadClusterBN101, ...>
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11,
typename enable_if<ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
bool>::type = false>
struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
{
using AIndex = MultiIndex<3>;
using BIndex = MultiIndex<3>;
using CIndex = MultiIndex<4>;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr index_t BK0 = ABlockDesc_BK0_BM_BK1{}.GetLength(I0);
static constexpr index_t BK1 = ABlockDesc_BK0_BM_BK1{}.GetLength(I2);
static constexpr index_t BM = ABlockDesc_BK0_BM_BK1{}.GetLength(I1);
static constexpr index_t BN = BBlockDesc_BK0_BN_BK1{}.GetLength(I1);
static constexpr index_t BM100 = BM10BN10ThreadClusterBM10Xs{}[I0];
static constexpr index_t BN100 = BM10BN10ThreadClusterBN10Xs{}[I0];
static constexpr index_t BM101 = BM10BN10ThreadClusterBM10Xs{}[I1];
static constexpr index_t BN101 = BM10BN10ThreadClusterBN10Xs{}[I1];
static constexpr index_t BM11 = BM1PerThreadBM11;
static constexpr index_t BN11 = BN1PerThreadBN11;
static constexpr index_t BM1 = BM100 * BM101 * BM11;
static constexpr index_t BN1 = BN100 * BN101 * BN11;
static constexpr index_t BM0 = BM / BM1;
static constexpr index_t BN0 = BN / BN1;
__host__ __device__ static constexpr auto
MakeABlockDescriptor_BK0_BM0_BM1_BK1(const ABlockDesc_BK0_BM_BK1& a_block_desc_bk0_bm_bk1)
{
const auto a_block_bk0_bm0_bm1_bk1 = transform_tensor_descriptor(
a_block_desc_bk0_bm_bk1,
make_tuple(make_pass_through_transform(Number<BK0>{}),
make_unmerge_transform(make_tuple(Number<BM0>{}, Number<BM1>{})),
make_pass_through_transform(Number<BK1>{})),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
return a_block_bk0_bm0_bm1_bk1;
}
__host__ __device__ static constexpr auto
MakeBBlockDescriptor_BK0_BN0_BN1_BK1(const BBlockDesc_BK0_BN_BK1& b_block_desc_bk0_bn_bk1)
{
const auto b_block_desc_bk0_bn0_bn1_bk1 = transform_tensor_descriptor(
b_block_desc_bk0_bn_bk1,
make_tuple(make_pass_through_transform(Number<BK0>{}),
make_unmerge_transform(make_tuple(Number<BN0>{}, Number<BN1>{})),
make_pass_through_transform(Number<BK1>{})),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
return b_block_desc_bk0_bn0_bn1_bk1;
}
__host__ __device__ static constexpr auto
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM_BN()
{
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// lower: [BM, BN]
constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n =
make_single_stage_tensor_adaptor(
make_tuple(make_unmerge_transform(make_tuple(
Number<BM0>{}, Number<BM100>{}, Number<BM101>{}, Number<BM11>{})),
make_unmerge_transform(make_tuple(
Number<BN0>{}, Number<BN100>{}, Number<BN101>{}, Number<BN11>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4, 5, 6, 7>{}));
return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n;
}
__host__ __device__ static constexpr auto
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1()
{
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// lower: [BM0, BM1, BN0, BN1]
constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1 =
make_single_stage_tensor_adaptor(
make_tuple(make_pass_through_transform(Number<BM0>{}),
make_unmerge_transform(
make_tuple(Number<BM100>{}, Number<BM101>{}, Number<BM11>{})),
make_pass_through_transform(Number<BN0>{}),
make_unmerge_transform(
make_tuple(Number<BN100>{}, Number<BN101>{}, Number<BN11>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}, Sequence<5, 6, 7>{}));
return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1;
}
__host__ __device__ static constexpr auto GetCThreadTensorLengths_BM0_BM1_BN0_BN1()
{
return Sequence<BM0, BM11, BN0, BN11>{};
}
static constexpr auto a_block_desc_bk0_bm0_bm1_bk1_ =
MakeABlockDescriptor_BK0_BM0_BM1_BK1(ABlockDesc_BK0_BM_BK1{});
static constexpr auto b_block_desc_bk0_bn0_bn1_bk1_ =
MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{});
public:
__device__ BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2()
: c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
get_thread_local_1d_id())},
a_thread_copy_{
make_tuple(0, c_thread_origin_data_idx_[I0], c_thread_origin_data_idx_[I1], 0)},
b_thread_copy_{
make_tuple(0, c_thread_origin_data_idx_[I2], c_thread_origin_data_idx_[I3], 0)}
{
static_assert(ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
static_assert(BlockSize == BM101 * BM100 * BN101 * BN100,
"wrong! blocksize and cluster size not consistent");
static_assert(BM % BM1 == 0 && BN % BN1 == 0, "wrong!");
static_assert(ABlockDesc_BK0_BM_BK1{}.GetLength(I0) ==
BBlockDesc_BK0_BN_BK1{}.GetLength(I0),
"wrong! K dimension not consistent");
// TODO remove this restriction
static_assert(BM10BN10ThreadClusterBM10Xs::Size() == 2 &&
BM10BN10ThreadClusterBN10Xs::Size() == 2,
"wrong!");
// TODO: remove this restriction
static_assert(BM0 == 2 && BN0 == 2, "wrong! BM0 and BN0 must both be 2");
}
__device__ static CIndex CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(index_t thread_id)
{
// lower: [BM0, BM1, BN0, BN1]
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
constexpr auto adaptor0 =
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1();
// lower: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// upper: [Tid, BM0, BM11, BN0, BN11]
constexpr auto adaptor1 = make_single_stage_tensor_adaptor(
make_tuple(make_merge_transform(make_tuple(BM100, BN100, BM101, BN101)),
make_pass_through_transform(BM0),
make_pass_through_transform(BM11),
make_pass_through_transform(BN0),
make_pass_through_transform(BN11)),
make_tuple(
Sequence<1, 5, 2, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<4>{}, Sequence<7>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
constexpr auto adaptor = chain_tensor_adaptors(adaptor0, adaptor1);
return adaptor.CalculateBottomIndex(make_multi_index(thread_id, 0, 0, 0, 0));
}
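// In CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1 above, the thread id is first
// split into its (BM100, BN100, BM101, BN101) thread-cluster coordinates, which
// the chained adaptor then lowers to this thread's (BM0, BM1, BN0, BN1) data
// origin inside the block tile.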
template <typename CThreadDesc_BM0_BM11_BN0_BN11,
typename ABlockBuffer,
typename BBlockBuffer,
typename CThreadBuffer>
__device__ void Run(const CThreadDesc_BM0_BM11_BN0_BN11&,
const ABlockBuffer& a_block_buf,
const BBlockBuffer& b_block_buf,
CThreadBuffer& c_thread_buf) const
{
static_assert(CThreadDesc_BM0_BM11_BN0_BN11::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
// TODO: remove this restriction
static_assert(BM0 == 2 && BN0 == 2 &&
CThreadDesc_BM0_BM11_BN0_BN11{}.GetLength(I0) == BM0 &&
CThreadDesc_BM0_BM11_BN0_BN11{}.GetLength(I2) == BN0,
"wrong");
auto a_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatA>(
a_thread_desc_bk0_bm0_bm1_bk1_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatB>(
b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize());
constexpr auto threadwise_contraction =
ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<
FloatA,
FloatB,
FloatC,
decltype(a_thread_desc_bk0_bm0_bm1_bk1_),
decltype(b_thread_desc_bk0_bn0_bn1_bk1_),
CThreadDesc_BM0_BM11_BN0_BN11,
Sequence<BK0PerThread, BK1>,
Sequence<1, BM1PerThreadBM11>,
Sequence<1, BN1PerThreadBN11>>{};
// read A_sub_0
a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, I0, I0, I0),
a_block_buf,
a_thread_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, I0, I0, I0),
a_thread_buf);
// read B_sub_0
b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, I0, I0, I0),
b_block_buf,
b_thread_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, I0, I0, I0),
b_thread_buf);
// read B_sub_1
b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, I1, I0, I0),
b_block_buf,
b_thread_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, I1, I0, I0),
b_thread_buf);
// read A_sub_1
a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, I1, I0, I0),
a_block_buf,
a_thread_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, I1, I0, I0),
a_thread_buf);
// C_sub_00 += transpose(A_sub_0) * B_sub_0
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I0, I0, I0),
b_thread_buf,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
make_tuple(I0, I0, I0, I0));
// C_sub_01 += transpose(A_sub_0) * B_sub_1
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I0, I0, I0),
b_thread_buf,
make_tuple(I0, I1, I0, I0),
c_thread_buf,
make_tuple(I0, I0, I1, I0));
// loop over rest of bk0
static_for<BK0PerThread, BK0, BK0PerThread>{}([&](auto bk0) {
// read A_sub_0
a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
make_tuple(bk0, I0, I0, I0),
a_block_buf,
a_thread_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, I0, I0, I0),
a_thread_buf);
// C_sub_10 += transpose(A_sub_1) * B_sub_0
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I1, I0, I0),
b_thread_buf,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
make_tuple(I1, I0, I0, I0));
// read B_sub_0
b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
make_tuple(bk0, I0, I0, I0),
b_block_buf,
b_thread_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, I0, I0, I0),
b_thread_buf);
// C_sub_11 += transpose(A_sub_1) * B_sub_1
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I1, I0, I0),
b_thread_buf,
make_tuple(I0, I1, I0, I0),
c_thread_buf,
make_tuple(I1, I0, I1, I0));
// read B_sub_1
b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
make_tuple(bk0, I1, I0, I0),
b_block_buf,
b_thread_desc_bk0_bn0_bn1_bk1_,
make_tuple(I0, I1, I0, I0),
b_thread_buf);
// read A_sub_1
a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
make_tuple(bk0, I1, I0, I0),
a_block_buf,
a_thread_desc_bk0_bm0_bm1_bk1_,
make_tuple(I0, I1, I0, I0),
a_thread_buf);
// C_sub_00 += transpose(A_sub_0) * B_sub_0
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I0, I0, I0),
b_thread_buf,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
make_tuple(I0, I0, I0, I0));
// C_sub_01 += transpose(A_sub_0) * B_sub_1
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I0, I0, I0),
b_thread_buf,
make_tuple(I0, I1, I0, I0),
c_thread_buf,
make_tuple(I0, I0, I1, I0));
});
// C_sub_10 += transpose(A_sub_1) * B_sub_0
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I1, I0, I0),
b_thread_buf,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
make_tuple(I1, I0, I0, I0));
// C_sub_11 += transpose(A_sub_1) * B_sub_1
threadwise_contraction.Run(a_thread_buf,
make_tuple(I0, I1, I0, I0),
b_thread_buf,
make_tuple(I0, I1, I0, I0),
c_thread_buf,
make_tuple(I1, I0, I1, I0));
}
private:
// A[BK0, BM0, BM1, BK1]
static constexpr auto a_thread_desc_bk0_bm0_bm1_bk1_ =
make_naive_tensor_descriptor_packed(make_tuple(
Number<BK0PerThread>{}, Number<BM0>{}, Number<BM1PerThreadBM11>{}, Number<BK1>{}));
// B[BK0, BN0, BN1, BK1]
static constexpr auto b_thread_desc_bk0_bn0_bn1_bk1_ =
make_naive_tensor_descriptor_packed(make_tuple(
Number<BK0PerThread>{}, Number<BN0>{}, Number<BN1PerThreadBN11>{}, Number<BK1>{}));
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4r1<
FloatA,
FloatA,
decltype(a_block_desc_bk0_bm0_bm1_bk1_),
decltype(a_thread_desc_bk0_bm0_bm1_bk1_),
Sequence<BK0PerThread, 1, BM1PerThreadBM11, BK1>, // SliceLengths
Sequence<0, 1, 2, 3>, // DimAccessOrder
Sequence<1, 1, BM1PerThreadBM11, BK1>, // SrcVectorTensorLengths
Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder
using BThreadCopy = ThreadwiseTensorSliceTransfer_v4r1<
FloatB,
FloatB,
decltype(b_block_desc_bk0_bn0_bn1_bk1_),
decltype(b_thread_desc_bk0_bn0_bn1_bk1_),
Sequence<BK0PerThread, 1, BN1PerThreadBN11, BK1>, // SliceLengths
Sequence<0, 1, 2, 3>, // DimAccessOrder
Sequence<1, 1, BN1PerThreadBN11, BK1>, // SrcVectorTensorLengths
Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder
CIndex c_thread_origin_data_idx_;
AThreadCopy a_thread_copy_;
BThreadCopy b_thread_copy_;
};
} // namespace ck
#endif
#ifndef CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
#define CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
#include "common_header.hpp"
#include "threadwise_gemm_dlops_v3.hpp"
namespace ck {
template <index_t BlockSize,
typename FloatA,
typename FloatB,
typename FloatC,
typename BlockMatrixA,
typename BlockMatrixB,
typename ThreadMatrixC,
index_t KPerThread,
index_t HPerThread,
index_t WPerThread,
index_t EPerThreadLoop,
index_t ThreadGemmADataPerRead_K,
index_t ThreadGemmBDataPerRead_W>
struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
{
struct MatrixIndex
{
index_t k;
index_t h;
index_t w;
};
// HACK: fix this @Jing Zhang
static constexpr index_t KPerThreadSubC = 4;
static constexpr auto a_thread_mtx_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadSubC>{}));
static constexpr auto b_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple(
Number<EPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
static constexpr auto c_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple(
Number<KPerThreadSubC>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
FloatA,
BlockMatrixA,
decltype(a_thread_mtx_),
Sequence<EPerThreadLoop, KPerThreadSubC>,
Sequence<0, 1>,
1,
ThreadGemmADataPerRead_K,
1>;
__device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3()
: c_thread_begin_mtx_idx_{GetBeginOfThreadMatrixC(get_thread_local_1d_id())},
a_thread_copy_{make_tuple(0, c_thread_begin_mtx_idx_.k * KPerThread)}
{
static_assert(BlockMatrixA::IsKnownAtCompileTime() &&
BlockMatrixB::IsKnownAtCompileTime() &&
ThreadMatrixC::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
"wrong! K dimension not consistent\n");
constexpr index_t K = BlockMatrixA{}.GetLength(I1); // A is transposed
constexpr index_t H = BlockMatrixB{}.GetLength(I2);
constexpr index_t W = BlockMatrixB{}.GetLength(I3);
static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0,
"wrong! Cannot evenly divide work among\n");
constexpr auto KThreadCluster = K / KPerThread;
constexpr auto HThreadCluster = H / HPerThread;
constexpr auto WThreadCluster = W / WPerThread;
static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster,
"wrong! wrong blocksize\n");
}
__device__ static constexpr auto GetThreadMatrixCLengths()
{
return Sequence<KPerThread, 1, HPerThread, WPerThread>{};
}
__device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
{
constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{});
constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{});
constexpr auto num_w_threads = W / WPerThread;
constexpr auto num_h_threads = H / HPerThread;
constexpr auto num_hw_threads = num_w_threads * num_h_threads;
index_t k_thread_id = thread_id / num_hw_threads;
index_t hw_thread_id = thread_id % num_hw_threads;
index_t h_thread_id = hw_thread_id / num_w_threads;
index_t w_thread_id = hw_thread_id % num_w_threads;
return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
}
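// Illustrative example for GetBeginOfThreadMatrixC (values assumed): with
// H / HPerThread == 4 and W / WPerThread == 4, thread_id = 27 gives
// k_thread_id = 27 / 16 = 1, h_thread_id = 11 / 4 = 2, w_thread_id = 11 % 4 = 3.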
template <typename ABlockBuffer, typename BThreadBuffer, typename CThreadBuffer>
__device__ void Run(const ABlockBuffer& a_block_buf,
const BThreadBuffer& b_thread_buf,
CThreadBuffer& c_thread_buf) const
{
static_assert(is_same<remove_cv_t<remove_reference_t<typename ABlockBuffer::type>>,
remove_cv_t<remove_reference_t<FloatA>>>::value &&
is_same<remove_cv_t<remove_reference_t<typename BThreadBuffer::type>>,
remove_cv_t<remove_reference_t<FloatB>>>::value &&
is_same<remove_cv_t<remove_reference_t<typename CThreadBuffer::type>>,
remove_cv_t<remove_reference_t<FloatC>>>::value,
"wrong! inconsistent type");
constexpr auto I0 = Number<0>{};
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto EPerBlock = a_block_mtx.GetLength(I0);
// HACK: fix this @Jing Zhang
constexpr auto HoPerThreadSubC = 2;
constexpr auto WoPerThreadSubC = 2;
static_assert(KPerThread % KPerThreadSubC == 0, "");
static_assert(HPerThread % HoPerThreadSubC == 0, "");
static_assert(WPerThread % WoPerThreadSubC == 0, "");
// thread A buffer for GEMM
StaticBuffer<AddressSpaceEnum_t::Vgpr, FloatA, a_thread_mtx_.GetElementSpaceSize(), true>
a_thread_buf;
constexpr auto threadwise_gemm = ThreadwiseGemmDlops_km_kn_mn_v3<FloatA,
FloatB,
FloatC,
decltype(a_thread_mtx_),
decltype(b_thread_mtx_),
decltype(c_thread_mtx_),
HoPerThreadSubC,
WoPerThreadSubC>{};
static_for<0, EPerBlock, EPerThreadLoop>{}([&](auto e_begin) {
static_for<0, KPerThread, KPerThreadSubC>{}([&](auto k_begin) {
a_thread_copy_.Run(a_block_mtx,
make_tuple(e_begin, k_begin),
a_block_buf,
a_thread_mtx_,
make_tuple(I0, I0),
a_thread_buf);
static_for<0, HPerThread, HoPerThreadSubC>{}([&](auto h_begin) {
static_for<0, WPerThread, WoPerThreadSubC>{}([&](auto w_begin) {
threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I0),
b_thread_buf,
make_tuple(e_begin, I0, h_begin, w_begin),
c_thread_buf,
make_tuple(k_begin, I0, h_begin, w_begin));
});
});
});
});
}
template <typename ABlockSliceMoveStepIdx>
__device__ void MoveASliceWindow(const BlockMatrixA&,
const ABlockSliceMoveStepIdx& a_block_slice_move_step_idx)
{
a_thread_copy_.MoveSrcSliceWindow(BlockMatrixA{}, a_block_slice_move_step_idx);
}
private:
MatrixIndex c_thread_begin_mtx_idx_;
AThreadCopy a_thread_copy_;
};
} // namespace ck
#endif
#ifndef CK_BLOCKWISE_GEMM_XDLOPS_HPP
#define CK_BLOCKWISE_GEMM_XDLOPS_HPP
#include "common_header.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "xdlops_gemm.hpp"
namespace ck {
template <index_t BlockSize,
typename FloatAB,
class ABlockDesc,
class BBlockDesc,
index_t MPerWave,
index_t NPerWave,
index_t K1>
struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1
{
using CIndex = MultiIndex<2>;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr index_t WaveSize = 64;
static constexpr index_t M0 = ABlockDesc{}.GetLength(I1);
static constexpr index_t M1 = ABlockDesc{}.GetLength(I2);
static constexpr index_t N0 = BBlockDesc{}.GetLength(I1);
static constexpr index_t N1 = BBlockDesc{}.GetLength(I2);
static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerWave, NPerWave, K1>{};
static constexpr index_t MWaves = M1 / MPerWave;
static constexpr index_t NWaves = N1 / NPerWave;
static constexpr index_t MRepeat = M0;
static constexpr index_t NRepeat = N0;
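// The block is tiled into MWaves x NWaves waves of WaveSize lanes each; the
// constructor asserts BlockSize == MWaves * NWaves * WaveSize. MRepeat and
// NRepeat are the number of per-wave repeats along M and N.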
__device__ constexpr auto GetCLayout() const { return xdlops_gemm.GetCLayout(); }
__device__ constexpr auto GetNumBlks() const { return xdlops_gemm.GetCLayout().GetNumBlks(); }
__device__ constexpr auto GetBlkSize() const { return xdlops_gemm.GetCLayout().GetBlkSize(); }
__device__ static auto CalculateAThreadOriginDataIndex()
{
const index_t thread_id = get_thread_local_1d_id();
const index_t waveId = thread_id / WaveSize;
const index_t laneId = thread_id % WaveSize;
const index_t waveId_m = waveId / NWaves;
if constexpr(xdlops_gemm.IsKReduction)
{
const index_t m_offset = waveId_m * MPerWave + xdlops_gemm.GetBlkTd(laneId);
const index_t k_offset = xdlops_gemm.GetBlkId(laneId);
return make_tuple(k_offset, 0, m_offset, 0);
}
else
{
const index_t m_offset = waveId_m * MPerWave + laneId;
const index_t k_offset = 0;
return make_tuple(k_offset, 0, m_offset, 0);
}
}
__device__ static auto CalculateBThreadOriginDataIndex()
{
const index_t thread_id = get_thread_local_1d_id();
const index_t waveId = thread_id / WaveSize;
const index_t laneId = thread_id % WaveSize;
const index_t waveId_n = waveId % NWaves;
if constexpr(xdlops_gemm.IsKReduction)
{
const index_t n_offset = waveId_n * NPerWave + xdlops_gemm.GetBlkTd(laneId);
const index_t k_offset = xdlops_gemm.GetBlkId(laneId);
return make_tuple(k_offset, 0, n_offset, 0);
}
else
{
const index_t n_offset = waveId_n * NPerWave + laneId;
const index_t k_offset = 0;
return make_tuple(k_offset, 0, n_offset, 0);
}
}
template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
__device__ static CIndex
CalculateCThreadOriginDataIndex(Number<m0>, Number<n0>, Number<xdlops_i>, Number<blk_i>)
{
const index_t waveId = get_thread_local_1d_id() / WaveSize;
const auto thread_mtx_on_blk = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i);
const index_t waveId_m = waveId / NWaves;
const index_t waveId_n = waveId % NWaves;
const index_t m_offset = m0 * M1 + waveId_m * MPerWave + thread_mtx_on_blk[I0];
const index_t n_offset = n0 * N1 + waveId_n * NPerWave + thread_mtx_on_blk[I1];
return CIndex{m_offset, n_offset};
}
__device__ BlockwiseGemmXdlops_km_kn_m0m1m2n_v1()
: a_thread_copy_{CalculateAThreadOriginDataIndex()},
b_thread_copy_{CalculateBThreadOriginDataIndex()}
{
static_assert(ABlockDesc::IsKnownAtCompileTime() && BBlockDesc::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
static_assert(ABlockDesc{}.GetLength(I0) == BBlockDesc{}.GetLength(I0),
"wrong! K dimension not consistent");
static_assert(ABlockDesc{}.GetLength(I3) == BBlockDesc{}.GetLength(I3),
"wrong! K1 dimension not consistent");
static_assert(BlockSize == MWaves * NWaves * WaveSize,
"BlockSize != MWaves * NWaves * WaveSize\n");
static_assert(K1 == BBlockDesc{}.GetLength(I3), "K1 is wrong!");
constexpr index_t KPerBlock = ABlockDesc{}.GetLength(I0);
static_assert(KPerBlock % xdlops_gemm.KPerXdlops == 0, "KPerBlock is wrong!");
static_assert(K1 % xdlops_gemm.mfma_type.k_base == 0, "K1 is wrong!");
}
template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
__device__ void Run(const ABlockBuffer& a_block_buf,
const BBlockBuffer& b_block_buf,
CThreadBuffer& c_thread_buf) const
{
auto a_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAB>(
a_thread_desc_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAB>(
b_thread_desc_.GetElementSpaceSize());
constexpr index_t KPerBlock = ABlockDesc{}.GetLength(I0);
vector_type<FloatAB, a_thread_desc_.GetElementSpaceSize()> a_thread_vec;
vector_type<FloatAB, b_thread_desc_.GetElementSpaceSize()> b_thread_vec;
static_for<0, KPerBlock, xdlops_gemm.KPerXdlops>{}([&](auto k) {
// read A
a_thread_copy_.Run(ABlockDesc{},
make_tuple(k, I0, I0, I0),
a_block_buf,
a_thread_desc_,
make_tuple(I0, I0, I0, I0),
a_thread_buf);
// read B
b_thread_copy_.Run(BBlockDesc{},
make_tuple(k, I0, I0, I0),
b_block_buf,
b_thread_desc_,
make_tuple(I0, I0, I0, I0),
b_thread_buf);
using mfma_input_type =
typename vector_type<FloatAB, xdlops_gemm.mfma_type.k_base>::type;
static_for<0, a_thread_desc_.GetElementSpaceSize(), 1>{}([&](auto i) {
a_thread_vec.template AsType<FloatAB>()(Number<i>{}) = a_thread_buf[Number<i>{}];
});
static_for<0, b_thread_desc_.GetElementSpaceSize(), 1>{}([&](auto i) {
b_thread_vec.template AsType<FloatAB>()(Number<i>{}) = b_thread_buf[Number<i>{}];
});
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
xdlops_gemm.template Run<decltype(a_thread_desc_),
decltype(b_thread_desc_),
decltype(c_thread_desc_),
m0,
n0>(a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf);
});
});
});
}
private:
// A[K, M]
static constexpr auto a_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(I1, Number<MRepeat>{}, I1, Number<K1>{}));
// B[K, N]
static constexpr auto b_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(I1, Number<NRepeat>{}, I1, Number<K1>{}));
static constexpr auto c_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{}, Number<NRepeat>{}));
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
FloatAB,
ABlockDesc,
decltype(a_thread_desc_),
Sequence<1, MRepeat, 1, K1>,
Sequence<0, 1, 2, 3>,
3,
K1,
1>;
using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
FloatAB,
BBlockDesc,
decltype(b_thread_desc_),
Sequence<1, NRepeat, 1, K1>,
Sequence<0, 1, 2, 3>,
3,
K1,
1>;
AThreadCopy a_thread_copy_;
BThreadCopy b_thread_copy_;
};
template <index_t BlockSize,
typename FloatAB,
class ABlockDesc,
class BBlockDesc,
index_t MPerWave,
index_t NPerWave,
index_t K1>
struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline
{
using CIndex = MultiIndex<2>;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto xdlops_gemm = XdlopsGemm<float, MPerWave, NPerWave, K1>{};
static constexpr index_t WaveSize = 64;
static constexpr index_t M0 = ABlockDesc{}.GetLength(I1);
static constexpr index_t M1 = ABlockDesc{}.GetLength(I2);
static constexpr index_t N0 = BBlockDesc{}.GetLength(I1);
static constexpr index_t N1 = BBlockDesc{}.GetLength(I2);
static constexpr index_t MWaves = M1 / MPerWave;
static constexpr index_t NWaves = N1 / NPerWave;
static constexpr index_t MRepeat = M0;
static constexpr index_t NRepeat = N0;
__device__ constexpr auto GetCLayout() const { return xdlops_gemm.GetCLayout(); }
__device__ constexpr auto GetNumBlks() const { return xdlops_gemm.GetCLayout().GetNumBlks(); }
__device__ constexpr auto GetBlkSize() const { return xdlops_gemm.GetCLayout().GetBlkSize(); }
__device__ static auto CalculateAThreadOriginDataIndex()
{
const index_t thread_id = get_thread_local_1d_id();
const index_t waveId = thread_id / WaveSize;
const index_t laneId = thread_id % WaveSize;
const index_t waveId_m = waveId / NWaves;
if constexpr(xdlops_gemm.IsKReduction)
{
const index_t m_offset = waveId_m * MPerWave + xdlops_gemm.GetBlkTd(laneId);
const index_t k_offset = xdlops_gemm.GetBlkId(laneId);
return make_tuple(k_offset, 0, m_offset, 0);
}
else
{
const index_t m_offset = waveId_m * MPerWave + laneId;
const index_t k_offset = 0;
return make_tuple(k_offset, 0, m_offset, 0);
}
}
__device__ static auto CalculateBThreadOriginDataIndex()
{
const index_t thread_id = get_thread_local_1d_id();
const index_t waveId = thread_id / WaveSize;
const index_t laneId = thread_id % WaveSize;
const index_t waveId_n = waveId % NWaves;
if constexpr(xdlops_gemm.IsKReduction)
{
const index_t n_offset = waveId_n * NPerWave + xdlops_gemm.GetBlkTd(laneId);
const index_t k_offset = xdlops_gemm.GetBlkId(laneId);
return make_tuple(k_offset, 0, n_offset, 0);
}
else
{
const index_t n_offset = waveId_n * NPerWave + laneId;
const index_t k_offset = 0;
return make_tuple(k_offset, 0, n_offset, 0);
}
}
template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
__device__ static CIndex
CalculateCThreadOriginDataIndex(Number<m0>, Number<n0>, Number<xdlops_i>, Number<blk_i>)
{
const index_t waveId = get_thread_local_1d_id() / WaveSize;
const auto thread_mtx_on_blk = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i);
const index_t waveId_m = waveId / NWaves;
const index_t waveId_n = waveId % NWaves;
const index_t m_offset = m0 * M1 + waveId_m * MPerWave + thread_mtx_on_blk[I0];
const index_t n_offset = n0 * N1 + waveId_n * NPerWave + thread_mtx_on_blk[I1];
return CIndex{m_offset, n_offset};
}
__device__ BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline()
: a_thread_copy_{CalculateAThreadOriginDataIndex()},
b_thread_copy_{CalculateBThreadOriginDataIndex()}
{
static_assert(ABlockDesc::IsKnownAtCompileTime() && BBlockDesc::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
static_assert(ABlockDesc{}.GetLength(I0) == BBlockDesc{}.GetLength(I0),
"wrong! K dimension not consistent");
static_assert(ABlockDesc{}.GetLength(I3) == BBlockDesc{}.GetLength(I3),
"wrong! K1 dimension not consistent");
static_assert(BlockSize == MWaves * NWaves * WaveSize,
"BlockSize != MWaves * NWaves * WaveSize\n");
static_assert(K1 == BBlockDesc{}.GetLength(I3), "K1 is wrong!");
constexpr index_t KPerBlock = ABlockDesc{}.GetLength(I0);
static_assert(KPerBlock % xdlops_gemm.KPerXdlops == 0, "KPerBlock is wrong!");
static_assert(K1 % xdlops_gemm.mfma_type.k_base == 0, "K1 is wrong!");
}
template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
__device__ void Run(const ABlockBuffer& a_block_buf,
const BBlockBuffer& b_block_buf,
CThreadBuffer& c_thread_buf) const
{
auto a_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAB>(
a_thread_desc_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAB>(
b_thread_desc_.GetElementSpaceSize());
constexpr index_t KPerBlock = ABlockDesc{}.GetLength(I0);
// read A_sub_0
a_thread_copy_.Run(ABlockDesc{},
make_tuple(I0, I0, I0, I0),
a_block_buf,
a_thread_desc_,
make_tuple(I0, I0, I0, I0),
a_thread_buf);
// read B_sub_0
b_thread_copy_.Run(BBlockDesc{},
make_tuple(I0, I0, I0, I0),
b_block_buf,
b_thread_desc_,
make_tuple(I0, I0, I0, I0),
b_thread_buf);
// read B_sub_1
b_thread_copy_.Run(BBlockDesc{},
make_tuple(I0, I1, I0, I0),
b_block_buf,
b_thread_desc_,
make_tuple(I0, I1, I0, I0),
b_thread_buf);
// read A_sub_1
a_thread_copy_.Run(ABlockDesc{},
make_tuple(I0, I1, I0, I0),
a_block_buf,
a_thread_desc_,
make_tuple(I0, I1, I0, I0),
a_thread_buf);
// C_sub_00 += transpose(A_sub_0) * B_sub_0
xdlops_gemm.template Run<decltype(a_thread_desc_),
decltype(b_thread_desc_),
decltype(c_thread_desc_),
0,
0>(a_thread_buf, b_thread_buf, c_thread_buf);
// C_sub_01 += transpose(A_sub_0) * B_sub_1
xdlops_gemm.template Run<decltype(a_thread_desc_),
decltype(b_thread_desc_),
decltype(c_thread_desc_),
0,
1>(a_thread_buf, b_thread_buf, c_thread_buf);
static_for<xdlops_gemm.KPerXdlops, KPerBlock, xdlops_gemm.KPerXdlops>{}([&](auto k) {
// read A_sub_0
a_thread_copy_.Run(ABlockDesc{},
make_tuple(k, I0, I0, I0),
a_block_buf,
a_thread_desc_,
make_tuple(I0, I0, I0, I0),
a_thread_buf);
// C_sub_10 += transpose(A_sub_1) * B_sub_0
xdlops_gemm.template Run<decltype(a_thread_desc_),
decltype(b_thread_desc_),
decltype(c_thread_desc_),
1,
0>(a_thread_buf, b_thread_buf, c_thread_buf);
// read B_sub_0
b_thread_copy_.Run(BBlockDesc{},
make_tuple(k, I0, I0, I0),
b_block_buf,
b_thread_desc_,
make_tuple(I0, I0, I0, I0),
b_thread_buf);
// C_sub_11 += transpose(A_sub_1) * B_sub_1
xdlops_gemm.template Run<decltype(a_thread_desc_),
decltype(b_thread_desc_),
decltype(c_thread_desc_),
1,
1>(a_thread_buf, b_thread_buf, c_thread_buf);
// read B_sub_1
b_thread_copy_.Run(BBlockDesc{},
make_tuple(k, I1, I0, I0),
b_block_buf,
b_thread_desc_,
make_tuple(I0, I1, I0, I0),
b_thread_buf);
// read A_sub_1
a_thread_copy_.Run(ABlockDesc{},
make_tuple(k, I1, I0, I0),
a_block_buf,
a_thread_desc_,
make_tuple(I0, I1, I0, I0),
a_thread_buf);
// C_sub_00 += transpose(A_sub_0) * B_sub_0
xdlops_gemm.template Run<decltype(a_thread_desc_),
decltype(b_thread_desc_),
decltype(c_thread_desc_),
0,
0>(a_thread_buf, b_thread_buf, c_thread_buf);
// C_sub_01 += transpose(A_sub_0) * B_sub_1
xdlops_gemm.template Run<decltype(a_thread_desc_),
decltype(b_thread_desc_),
decltype(c_thread_desc_),
0,
1>(a_thread_buf, b_thread_buf, c_thread_buf);
});
// C_sub_10 += transpose(A_sub_1) * B_sub_0
xdlops_gemm.template Run<decltype(a_thread_desc_),
decltype(b_thread_desc_),
decltype(c_thread_desc_),
1,
0>(a_thread_buf, b_thread_buf, c_thread_buf);
// C_sub_11 += transpose(A_sub_1) * B_sub_1
xdlops_gemm.template Run<decltype(a_thread_desc_),
decltype(b_thread_desc_),
decltype(c_thread_desc_),
1,
1>(a_thread_buf, b_thread_buf, c_thread_buf);
}
private:
// A[K, M]
static constexpr auto a_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(I1, Number<MRepeat>{}, I1, Number<K1>{}));
// B[K, N]
static constexpr auto b_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(I1, Number<NRepeat>{}, I1, Number<K1>{}));
static constexpr auto c_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{}, Number<NRepeat>{}));
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
FloatAB,
ABlockDesc,
decltype(a_thread_desc_),
Sequence<1, 1, 1, K1>,
Sequence<0, 1, 2, 3>,
3,
1, // K1,
1>;
using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
FloatAB,
BBlockDesc,
decltype(b_thread_desc_),
Sequence<1, 1, 1, K1>,
Sequence<0, 1, 2, 3>,
3,
1, // K1,
1>;
AThreadCopy a_thread_copy_;
BThreadCopy b_thread_copy_;
};
} // namespace ck
#endif
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
// This version does the following things to avoid the scratch-memory issue:
// 1. Use StaticallyIndexedArray instead of a C array for the thread buffer
// 2. ThreadwiseTensorSliceTransfer_v3 does not keep a reference to the tensor descriptor
// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct a new tensor coordinate
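// Typical usage (sketch; descriptor and buffer types are assumed): construct with
// the source/destination descriptors and the block-level slice origins, then per
// iteration call RunRead(src_desc, src_buf), RunWrite(dst_desc, dst_buf), and
// MoveSrcSliceWindow(src_desc, step) to advance the read window.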
template <index_t BlockSize,
InMemoryDataOperationEnum_t DstInMemOp,
typename BlockSliceLengths,
typename ThreadSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorDim,
index_t DstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector,
bool ThreadTransferSrcResetCoordinateAfterRun,
bool ThreadTransferDstResetCoordinateAfterRun>
struct BlockwiseTensorSliceTransfer_v4
{
static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseTensorSliceTransfer_v4(const SrcDesc& src_desc,
const Index& src_block_slice_origin,
const DstDesc& dst_desc,
const Index& dst_block_slice_origin)
: threadwise_transfer_(
src_desc, make_zero_multi_index<nDim>(), dst_desc, make_zero_multi_index<nDim>())
{
static_assert(nDim == remove_reference_t<remove_cv_t<SrcDesc>>::GetNumOfDimension() &&
nDim == remove_reference_t<remove_cv_t<DstDesc>>::GetNumOfDimension() &&
nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(
is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
"wrong! BlockSize too small");
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
make_multi_index(get_thread_local_1d_id()));
const auto thread_data_idx_begin = thread_cluster_idx * ThreadSliceLengths{};
threadwise_transfer_.SetSrcSliceOrigin(src_desc,
src_block_slice_origin + thread_data_idx_begin);
threadwise_transfer_.SetDstSliceOrigin(dst_desc,
dst_block_slice_origin + thread_data_idx_begin);
}
}
template <typename SrcBuffer, typename SrcStepHacks>
__device__ void
RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunRead(src_desc, src_buf, src_step_hacks);
}
}
template <typename SrcBuffer>
__device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunRead(src_desc, src_buf);
}
}
template <typename DstBuffer>
__device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunWrite(dst_desc, dst_buf);
}
}
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
}
}
// SrcMoveSliceWindowStepHack to control index calculation move slice window
template <typename SrcMoveSliceWindowStepHack>
__device__ void
MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& step,
const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveSrcSliceWindow(
src_desc, step, src_move_slice_window_step_hack);
}
}
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveDstSliceWindow(dst_desc, step);
}
}
private:
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
using ThreadwiseTransfer =
ThreadwiseTensorSliceTransfer_v3<ThreadSliceLengths,
DstInMemOp,
SrcData,
DstData,
SrcDesc,
DstDesc,
SrcDimAccessOrder,
DstDimAccessOrder,
SrcVectorDim,
DstVectorDim,
SrcScalarPerVector,
DstScalarPerVector,
SrcScalarStrideInVector,
DstScalarStrideInVector,
ThreadTransferSrcResetCoordinateAfterRun,
ThreadTransferDstResetCoordinateAfterRun>;
ThreadwiseTransfer threadwise_transfer_;
};
} // namespace ck
#endif
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V2_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V2_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_v2.hpp"
namespace ck {
// This version does the following things to avoid the scratch-memory issue:
// 1. Use StaticallyIndexedArray instead of a C array for the thread buffer
// 2. ThreadwiseTensorSliceTransfer_v3r1 does not keep a reference to the tensor descriptor
// 3. ThreadwiseTensorSliceTransfer_v3r1::Run() does not construct a new tensor coordinate
template <index_t BlockSize,
InMemoryDataOperationEnum_t DstInMemOp,
typename BlockSliceLengths,
typename ThreadSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
typename SrcVectorTensorLengths,
typename DstVectorTensorLengths,
typename SrcVectorTensorContiguousDimOrder,
typename DstVectorTensorContiguousDimOrder,
bool ThreadTransferSrcResetCoordinateAfterRun,
bool ThreadTransferDstResetCoordinateAfterRun>
struct BlockwiseTensorSliceTransfer_v4r1
{
static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseTensorSliceTransfer_v4r1(const SrcDesc& src_desc,
const Index& src_block_slice_origin,
const DstDesc& dst_desc,
const Index& dst_block_slice_origin)
: threadwise_transfer_(
src_desc, make_zero_multi_index<nDim>(), dst_desc, make_zero_multi_index<nDim>())
{
static_assert(nDim == remove_reference_t<remove_cv_t<SrcDesc>>::GetNumOfDimension() &&
nDim == remove_reference_t<remove_cv_t<DstDesc>>::GetNumOfDimension() &&
nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(
is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
"wrong! BlockSize too small");
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
make_multi_index(get_thread_local_1d_id()));
const auto thread_data_idx_begin = thread_cluster_idx * ThreadSliceLengths{};
threadwise_transfer_.SetSrcSliceOrigin(src_desc,
src_block_slice_origin + thread_data_idx_begin);
threadwise_transfer_.SetDstSliceOrigin(dst_desc,
dst_block_slice_origin + thread_data_idx_begin);
}
}
template <typename SrcBuffer, typename SrcStepHacks>
__device__ void
RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunRead(src_desc, src_buf, src_step_hacks);
}
}
template <typename DstBuffer>
__device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunWrite(dst_desc, dst_buf);
}
}
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
}
}
// SrcMoveSliceWindowStepHack to control index calculation move slice window
template <typename SrcMoveSliceWindowStepHack>
__device__ void
MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& step,
const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveSrcSliceWindow(
src_desc, step, src_move_slice_window_step_hack);
}
}
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveDstSliceWindow(dst_desc, step);
}
}
private:
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
using ThreadwiseTransfer =
ThreadwiseTensorSliceTransfer_v3r1<ThreadSliceLengths,
DstInMemOp,
SrcData,
DstData,
SrcDesc,
DstDesc,
SrcDimAccessOrder,
DstDimAccessOrder,
SrcVectorTensorLengths,
DstVectorTensorLengths,
SrcVectorTensorContiguousDimOrder,
DstVectorTensorContiguousDimOrder,
ThreadTransferSrcResetCoordinateAfterRun,
ThreadTransferDstResetCoordinateAfterRun>;
ThreadwiseTransfer threadwise_transfer_;
};
} // namespace ck
#endif
#ifndef CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP
#define CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_gemm_dlops_v2r3.hpp"
#include "blockwise_tensor_slice_transfer_v2.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "threadwise_tensor_slice_set.hpp"
namespace ck {
template <typename GridwiseContraction,
typename FloatAB,
typename FloatC,
typename AGridDesc_GK0_GM0_GM10_GM11_GK1,
typename BGridDesc_GK0_GN0_GN10_GN11_GK1,
typename CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1,
typename CGridBlockCluster_BlockId_To_GM10_GN10,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_contraction_dlops_v1r2(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const AGridDesc_GK0_GM0_GM10_GM11_GK1 a_grid_desc_gk0_gm0_gm10_gm11_gk1,
const BGridDesc_GK0_GN0_GN10_GN11_GK1 b_grid_desc_gk0_gn0_gn10_gn11_gk1,
const CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
const CGridBlockCluster_BlockId_To_GM10_GN10 c_grid_block_cluster_blockid_to_gm10_gn10)
{
constexpr index_t shared_block_size =
GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseContraction::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_grid_desc_gk0_gm0_gm10_gm11_gk1,
b_grid_desc_gk0_gn0_gn10_gn11_gk1,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
c_grid_block_cluster_blockid_to_gm10_gn10,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
typename AGridDesc_GK0_GM0_GM1_GK1,
typename BGridDesc_GK0_GN0_GN1_GK1,
typename CGridDesc_GM0_GM1_GN0_GN1,
index_t GM1PerBlockGM11,
index_t GN1PerBlockGN11,
index_t GK0PerBlock,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread,
typename BM10BN10ThreadClusterBM10Xs,
typename BM10BN10ThreadClusterBN10Xs,
typename ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
typename ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
typename ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
typename ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
typename ABlockTransferSrcVectorTensorContiguousDimOrder,
typename BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
typename BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
typename BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
typename BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
typename BBlockTransferSrcVectorTensorContiguousDimOrder,
typename CThreadTransferSrcDstAccessOrder,
index_t CThreadTransferSrcDstVectorDim,
index_t CThreadTransferDstScalarPerVector,
typename AGridStepHacks,
typename BGridStepHacks,
typename CGridStepHacks,
typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowStepHacks>
struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
// GM0 and GN0 need to be known at compile-time
static constexpr auto GM0 = CGridDesc_GM0_GM1_GN0_GN1{}.GetLength(I0);
static constexpr auto GN0 = CGridDesc_GM0_GM1_GN0_GN1{}.GetLength(I2);
static constexpr auto GK1 = AGridDesc_GK0_GM0_GM1_GK1{}.GetLength(I3);
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
// lds max alignment
// TODO: part of them should be moved into blockwise-gemm
// TODO: change this. I think it needs multi-dimensional alignment
constexpr auto max_lds_align = GK1;
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned(
make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
max_lds_align);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned(
make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
max_lds_align);
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_aligned_space_size = math::integer_least_multiple(
a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_aligned_space_size = math::integer_least_multiple(
b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize(), max_lds_align);
return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB);
}
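// Note: the factor of 2 above reserves LDS for two copies of the A and B block
// tiles, i.e. the double buffering the main K loop ping-pongs between.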
__host__ __device__ static constexpr bool
CheckValidity(const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1,
const BGridDesc_GK0_GN0_GN1_GK1& b_grid_desc_gk0_gn0_gn1_gk1,
const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1)
{
static_assert(is_known_at_compile_time<remove_cv_t<decltype(GM0)>>::value &&
is_known_at_compile_time<remove_cv_t<decltype(GN0)>>::value,
"wrong! GM0 and GN0 need to be known at compile-time");
const auto GM1 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I2);
const auto GN1 = b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I2);
const auto GK0 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0);
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
return (
(GM0 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I0) &&
GM1 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1) &&
GN0 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I2) &&
GN1 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3) &&
GM0 == a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I1) &&
GM1 == a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I2) &&
GN0 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I1) &&
GN1 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I2) &&
GK0 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I0) &&
GK1 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I3)) &&
(GM1 % GM1PerBlockGM11 == 0 && GN1 % GN1PerBlockGN11 == 0 && GK0 % GK0PerBlock == 0));
}
__host__ __device__ static constexpr index_t
CalculateGridSize(const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1)
{
const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1);
const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3);
constexpr index_t GM11 = GM1PerBlockGM11;
constexpr index_t GN11 = GN1PerBlockGN11;
const index_t GM10 = GM1 / GM11;
const index_t GN10 = GN1 / GN11;
const index_t grid_size = GM10 * GN10;
return grid_size;
}
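// Illustrative example (values assumed): GM1 = 256, GN1 = 512 and
// GM1PerBlockGM11 = GN1PerBlockGN11 = 128 give GM10 = 2, GN10 = 4, so the grid
// has 2 * 4 = 8 workgroups.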
__host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t GK0)
{
const bool has_main_k_block_loop = (GK0 + GK0PerBlock) / (2 * GK0PerBlock) > 1;
return has_main_k_block_loop;
}
__host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t GK0)
{
const bool has_double_tail_k_block_loop = (GK0 / GK0PerBlock) % 2 == 0;
return has_double_tail_k_block_loop;
}
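// Illustrative example (values assumed): GK0 = 32, GK0PerBlock = 8 gives
// (32 + 8) / (2 * 8) = 2 > 1, so there is a main K-block loop, and
// (32 / 8) % 2 == 0, so the loop ends with a double (two-chunk) tail.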
__host__ __device__ static constexpr auto MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(
const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1)
{
const auto GK0 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0);
const auto GM1 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I2);
const auto GM11 = Number<GM1PerBlockGM11>{};
const auto GM10 = GM1 / GM11;
const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = transform_tensor_descriptor(
a_grid_desc_gk0_gm0_gm1_gk1,
make_tuple(make_pass_through_transform(GK0),
make_pass_through_transform(GM0),
make_unmerge_transform(make_tuple(GM10, GM11)),
make_pass_through_transform(GK1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}));
return a_grid_desc_gk0_gm0_gm10_gm11_gk1;
}
__host__ __device__ static constexpr auto MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(
const BGridDesc_GK0_GN0_GN1_GK1& b_grid_desc_gk0_gn0_gn1_gk1)
{
const auto GK0 = b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I0);
const auto GN1 = b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I2);
const auto GN11 = Number<GN1PerBlockGN11>{};
const auto GN10 = GN1 / GN11;
const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = transform_tensor_descriptor(
b_grid_desc_gk0_gn0_gn1_gk1,
make_tuple(make_pass_through_transform(GK0),
make_pass_through_transform(GN0),
make_unmerge_transform(make_tuple(GN10, GN11)),
make_pass_through_transform(GK1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}));
return b_grid_desc_gk0_gn0_gn10_gn11_gk1;
}
__host__ __device__ static constexpr auto MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(
const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1)
{
const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1);
const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3);
constexpr auto GM11 = Number<GM1PerBlockGM11>{};
constexpr auto GN11 = Number<GN1PerBlockGN11>{};
const auto GM10 = GM1 / GM11;
const auto GN10 = GN1 / GN11;
constexpr auto BM = GM0 * GM11;
constexpr auto BN = GN0 * GN11;
constexpr auto BM1 =
Number<container_reduce(BM10BN10ThreadClusterBM10Xs{}, math::multiplies{}, I1) *
BM1PerThreadBM11>{};
constexpr auto BN1 =
Number<container_reduce(BM10BN10ThreadClusterBN10Xs{}, math::multiplies{}, I1) *
BN1PerThreadBN11>{};
constexpr auto BM0 = BM / BM1;
constexpr auto BN0 = BN / BN1;
const auto c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc = transform_tensor_descriptor(
c_grid_desc_gm0_gm1_gn0_gn1,
make_tuple(make_pass_through_transform(GM0),
make_unmerge_transform(make_tuple(GM10, GM11)),
make_pass_through_transform(GN0),
make_unmerge_transform(make_tuple(GN10, GN11))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}));
const auto c_gm10_bm_gn10_bn_grid_desc = transform_tensor_descriptor(
c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc,
make_tuple(make_pass_through_transform(GM10),
make_merge_transform(make_tuple(GM0, GM11)),
make_pass_through_transform(GN10),
make_merge_transform(make_tuple(GN0, GN11))),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}, Sequence<4>{}, Sequence<3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = transform_tensor_descriptor(
c_gm10_bm_gn10_bn_grid_desc,
make_tuple(make_pass_through_transform(GM10),
make_unmerge_transform(make_tuple(BM0, BM1)),
make_pass_through_transform(GN10),
make_unmerge_transform(make_tuple(BN0, BN1))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}));
return c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1;
}
__host__ __device__ static constexpr auto MakeCGridBlockCluster_BlockId_To_GM10_GN10(
const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1)
{
const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1);
const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3);
constexpr auto GM11 = Number<GM1PerBlockGM11>{};
constexpr auto GN11 = Number<GN1PerBlockGN11>{};
const auto GM10 = GM1 / GM11;
const auto GN10 = GN1 / GN11;
const auto c_grid_block_cluster_blockid_to_gm10_gn10 = make_single_stage_tensor_adaptor(
make_tuple(make_merge_transform(make_tuple(GM10, GN10))),
make_tuple(Sequence<0, 1>{}),
make_tuple(Sequence<0>{}));
return c_grid_block_cluster_blockid_to_gm10_gn10;
}
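// The adaptor above merges (GM10, GN10) into the 1D block id, so a workgroup
// recovers gm10 = block_id / GN10 and gn10 = block_id % GN10 (row-major order)
// via CalculateBottomIndex.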
using AGridDesc_GK0_GM0_GM10_GM11_GK1 =
decltype(MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(AGridDesc_GK0_GM0_GM1_GK1{}));
using BGridDesc_GK0_GN0_GN10_GN11_GK1 =
decltype(MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(BGridDesc_GK0_GN0_GN1_GK1{}));
using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 =
decltype(MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(CGridDesc_GM0_GM1_GN0_GN1{}));
using CGridBlockCluster_BlockId_To_GM10_GN10 =
decltype(MakeCGridBlockCluster_BlockId_To_GM10_GN10(CGridDesc_GM0_GM1_GN0_GN1{}));
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ static void
Run(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
FloatAB* __restrict__ p_shared_block,
const AGridDesc_GK0_GM0_GM10_GM11_GK1& a_grid_desc_gk0_gm0_gm10_gm11_gk1,
const BGridDesc_GK0_GN0_GN10_GN11_GK1& b_grid_desc_gk0_gn0_gn10_gn11_gk1,
const CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1& c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
const CGridBlockCluster_BlockId_To_GM10_GN10& c_grid_block_cluster_blockid_to_gm10_gn10,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>)
{
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_a_grid, a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize());
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_b_grid, b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize());
auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_c_grid, c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetElementSpaceSize());
const auto GK0 = a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I0);
// divide block work by [GM10, GN10]
const auto c_gm10_gn10_block_cluster_idx =
c_grid_block_cluster_blockid_to_gm10_gn10.CalculateBottomIndex(
make_multi_index(get_block_1d_id()));
// HACK: this forces index data into SGPRs
const index_t igm10 = __builtin_amdgcn_readfirstlane(c_gm10_gn10_block_cluster_idx[I0]);
const index_t ign10 = __builtin_amdgcn_readfirstlane(c_gm10_gn10_block_cluster_idx[I1]);
// lds max alignment
// TODO: part of them should be moved into blockwise-gemm
// TODO: change this. I think it needs multi-dimensional alignment
constexpr auto max_lds_align = GK1;
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned(
make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
max_lds_align);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned(
make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
max_lds_align);
// A matrix in LDS memory for blockwise GEMM
// be careful of LDS alignment
constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned(
make_tuple(Number<GK0PerBlock>{}, GM0 * Number<GM1PerBlockGM11>{}, GK1), max_lds_align);
// B matrix in LDS memory for blockwise GEMM
// be careful of LDS alignment
constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned(
make_tuple(Number<GK0PerBlock>{}, GN0 * Number<GN1PerBlockGN11>{}, GK1), max_lds_align);
static_assert(a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize() ==
a_block_desc_gk0_bm_gk1.GetElementSpaceSize() &&
b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize() ==
b_block_desc_gk0_bn_gk1.GetElementSpaceSize(),
"wrong!");
// A matrix blockwise copy
auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1<
BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<GK0PerBlock, GM0, 1, GM1PerBlockGM11, GK1.value>,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterArrangeOrder,
FloatAB,
FloatAB,
decltype(a_grid_desc_gk0_gm0_gm10_gm11_gk1),
decltype(a_block_desc_gk0_gm0_gm10_gm11_gk1),
ABlockTransferSrcAccessOrder,
Sequence<0, 1, 2, 3, 4>,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, // SrcVectorTensorLengths
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, // DstVectorTensorLengths
ABlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder
Sequence<0, 1, 2, 3, 4>, // DstVectorTensorContiguousDimOrder
false,
true>(a_grid_desc_gk0_gm0_gm10_gm11_gk1,
make_multi_index(0, 0, igm10, 0, 0),
a_block_desc_gk0_gm0_gm10_gm11_gk1,
make_multi_index(0, 0, 0, 0, 0));
// B matrix blockwise copy
auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1<
BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<GK0PerBlock, GN0, 1, GN1PerBlockGN11, GK1.value>,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterArrangeOrder,
FloatAB,
FloatAB,
decltype(b_grid_desc_gk0_gn0_gn10_gn11_gk1),
decltype(b_block_desc_gk0_gn0_gn10_gn11_gk1),
BBlockTransferSrcAccessOrder,
Sequence<0, 1, 2, 3, 4>,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, // SrcVectorTensorLengths
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, // DstVectorTensorLengths
BBlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder
Sequence<0, 1, 2, 3, 4>, // DstVectorTensorContiguousDimOrder
false,
true>(b_grid_desc_gk0_gn0_gn10_gn11_gk1,
make_multi_index(0, 0, ign10, 0, 0),
b_block_desc_gk0_gn0_gn10_gn11_gk1,
make_multi_index(0, 0, 0, 0, 0));
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx[GK0PerBlock, GM1PerBlockGM11] is in LDS
// b_mtx[GK0PerBlock, GN1PerBlockGN11] is in LDS
// c_mtx[GM1PerBlockGM11, GN1PerBlockGN11] is distributed among threads, and saved in
// register
const auto blockwise_gemm =
BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2<
BlockSize,
FloatAB,
FloatAB,
FloatAcc,
decltype(a_block_desc_gk0_bm_gk1),
decltype(b_block_desc_gk0_bn_gk1),
BM1PerThreadBM11,
BN1PerThreadBN11,
BK0PerThread,
BM10BN10ThreadClusterBM10Xs,
BM10BN10ThreadClusterBN10Xs,
BM1PerThreadBM11,
BN1PerThreadBN11>{};
constexpr auto c_thread_tensor_lengths_bm0_bm1_bn0_bn1 =
decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1();
constexpr auto c_thread_desc_bm0_bm1_bn0_bn1 = make_naive_tensor_descriptor_packed(
sequence_to_tuple_of_number(c_thread_tensor_lengths_bm0_bm1_bn0_bn1));
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_aligned_space_size = math::integer_least_multiple(
a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_aligned_space_size = math::integer_least_multiple(
b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize(), max_lds_align);
FloatAB* p_a_block_double = p_shared_block;
FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size;
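// LDS layout: [A even | A odd | B even | B odd], one pair of tiles per operand for double buffering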
// register allocation for output
auto c_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAcc>(
c_thread_desc_bm0_bm1_bn0_bn1.GetElementSpaceSize());
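// zero-initialize the C accumulator registers before the K loop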
ThreadwiseTensorSliceSet_v1<FloatAcc,
decltype(c_thread_desc_bm0_bm1_bn0_bn1),
decltype(c_thread_tensor_lengths_bm0_bm1_bn0_bn1)>{}
.Run(c_thread_desc_bm0_bm1_bn0_bn1,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
FloatAcc{0});
constexpr auto a_block_slice_copy_step = make_multi_index(GK0PerBlock, 0, 0, 0, 0);
constexpr auto b_block_slice_copy_step = make_multi_index(GK0PerBlock, 0, 0, 0, 0);
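// each MoveSrcSliceWindow() below advances the A/B read windows by one GK0PerBlock tile along GK0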
auto a_block_even_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_a_block_double, a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize());
auto b_block_even_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_b_block_double, b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize());
auto a_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_a_block_double + a_block_aligned_space_size,
a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize());
auto b_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_b_block_double + b_block_aligned_space_size,
b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize());
// LDS double buffer: preload data into LDS
{
a_blockwise_copy.RunRead(
a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridStepHacks{});
b_blockwise_copy.RunRead(
b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridStepHacks{});
a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_even_buf);
b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_even_buf);
}
if constexpr(HasMainKBlockLoop)
{
index_t gk0_block_on_grid = 0;
// LDS double buffer: main body
// use Do-While loop instead of For loop to simplify control flow
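// each trip consumes two GK0PerBlock tiles: GEMM on the even buffers while the odd buffers are
// filled, then GEMM on the odd buffers while the even buffers are refilled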
do
{
// even iteration
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_gk0_gm0_gm10_gm11_gk1,
a_block_slice_copy_step,
AGridMoveSliceWindowStepHacks{});
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_gk0_gn0_gn10_gn11_gk1,
b_block_slice_copy_step,
BGridMoveSliceWindowStepHacks{});
__syncthreads();
// LDS double buffer: load next data from device mem
a_blockwise_copy.RunRead(
a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridStepHacks{});
b_blockwise_copy.RunRead(
b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridStepHacks{});
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(c_thread_desc_bm0_bm1_bn0_bn1,
a_block_even_buf,
b_block_even_buf,
c_thread_buf);
// LDS double buffer: store next data to LDS
a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_odd_buf);
b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_odd_buf);
// odd iteration
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_gk0_gm0_gm10_gm11_gk1,
a_block_slice_copy_step,
AGridMoveSliceWindowStepHacks{});
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_gk0_gn0_gn10_gn11_gk1,
b_block_slice_copy_step,
BGridMoveSliceWindowStepHacks{});
__syncthreads();
// LDS double buffer: load next data from device mem
a_blockwise_copy.RunRead(
a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridStepHacks{});
b_blockwise_copy.RunRead(
b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridStepHacks{});
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(
c_thread_desc_bm0_bm1_bn0_bn1, a_block_odd_buf, b_block_odd_buf, c_thread_buf);
// LDS double buffer: store next data to LDS
a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_even_buf);
b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_even_buf);
gk0_block_on_grid += 2 * GK0PerBlock;
} while(gk0_block_on_grid < GK0 - 2 * GK0PerBlock);
}
// LDS double buffer: tail
if constexpr(HasDoubleTailKBlockLoop) // if there are 2 iterations left
{
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_gk0_gm0_gm10_gm11_gk1,
a_block_slice_copy_step,
AGridMoveSliceWindowStepHacks{});
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_gk0_gn0_gn10_gn11_gk1,
b_block_slice_copy_step,
BGridMoveSliceWindowStepHacks{});
__syncthreads();
// LDS double buffer: load last data from device mem
a_blockwise_copy.RunRead(
a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridStepHacks{});
b_blockwise_copy.RunRead(
b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridStepHacks{});
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm.Run(
c_thread_desc_bm0_bm1_bn0_bn1, a_block_even_buf, b_block_even_buf, c_thread_buf);
// LDS double buffer: store last data to LDS
a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_odd_buf);
b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_odd_buf);
__syncthreads();
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(
c_thread_desc_bm0_bm1_bn0_bn1, a_block_odd_buf, b_block_odd_buf, c_thread_buf);
}
else // if there is 1 iteration left
{
__syncthreads();
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(
c_thread_desc_bm0_bm1_bn0_bn1, a_block_even_buf, b_block_even_buf, c_thread_buf);
}
// output: register to global memory
{
constexpr auto c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1 =
make_naive_tensor_descriptor_packed(
make_tuple(I1,
Number<c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I0]>{},
Number<c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I1]>{},
I1,
Number<c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I2]>{},
Number<c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I3]>{}));
const auto c_thread_origin_on_block_bm0_bm1_bn0_bn1 =
blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
get_thread_local_1d_id());
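// each thread writes its private [1, BM0, BM1, 1, BN0, BN1] sub-tile to global memory, offset by
// the block origin (igm10, ign10) and the thread's own origin within the block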
ThreadwiseTensorSliceTransfer_v1r3<
FloatAcc,
FloatC,
decltype(c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1),
decltype(c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1),
Sequence<1,
c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I0],
c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I1],
1,
c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I2],
c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I3]>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
CGlobalMemoryDataOperation,
1,
false>{c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
make_multi_index(igm10,
c_thread_origin_on_block_bm0_bm1_bn0_bn1[I0],
c_thread_origin_on_block_bm0_bm1_bn0_bn1[I1],
ign10,
c_thread_origin_on_block_bm0_bm1_bn0_bn1[I2],
c_thread_origin_on_block_bm0_bm1_bn0_bn1[I3])}
.Run(c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1,
make_tuple(I0, I0, I0, I0, I0, I0),
c_thread_buf,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
c_grid_buf,
CGridStepHacks{});
}
}
};
} // namespace ck
#endif
#ifndef CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP
#define CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_gemm_dlops_v2r2.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "threadwise_tensor_slice_set.hpp"
namespace ck {
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AKM0M1GridDesc,
typename BKN0N1GridDesc,
typename CM0M10M11N0N10N11GridDesc,
typename CBlockIdToM0N0BlockClusterAdaptor,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v1r2(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const AKM0M1GridDesc a_k_m0_m1_grid_desc,
const BKN0N1GridDesc b_k_n0_n1_grid_desc,
const CM0M10M11N0N10N11GridDesc c_m0_m10_m11_n0_n10_n11_grid_desc,
const CBlockIdToM0N0BlockClusterAdaptor c_blockid_to_m0_n0_block_cluster_adaptor)
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
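// the LDS requirement is reported in bytes; convert it to a FloatAB element count for the array below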
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k_m0_m1_grid_desc,
b_k_n0_n1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
c_blockid_to_m0_n0_block_cluster_adaptor,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
// pass tensor descriptor by CONSTANT void pointer
// CONSTANT is needed to inform the compiler that the void pointers in the kernel signature point to
// the non-modifiable parameter address space, so the compiler can enable the corresponding optimization
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AKM0M1GridDesc,
typename BKN0N1GridDesc,
typename CM0M10M11N0N10N11GridDesc,
typename CBlockIdToM0N0BlockClusterAdaptor,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v1r2(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_k_m0_m1_grid_desc,
const void CONSTANT* p_b_k_n0_n1_grid_desc,
const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc,
const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor)
{
// first cast the CONSTANT void* to a generic void*,
// then cast the void* to the descriptor type;
// the tensor descriptor's copy constructor does not accept an address_space(4) pointer
const auto a_k_m0_m1_grid_desc = *reinterpret_cast<const AKM0M1GridDesc*>(
cast_pointer_to_generic_address_space(p_a_k_m0_m1_grid_desc));
const auto b_k_n0_n1_grid_desc = *reinterpret_cast<const BKN0N1GridDesc*>(
cast_pointer_to_generic_address_space(p_b_k_n0_n1_grid_desc));
const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
*reinterpret_cast<const CM0M10M11N0N10N11GridDesc*>(
cast_pointer_to_generic_address_space(p_c_m0_m10_m11_n0_n10_n11_grid_desc));
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
cast_pointer_to_generic_address_space(p_c_blockid_to_m0_n0_block_cluster_adaptor));
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k_m0_m1_grid_desc,
b_k_n0_n1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
c_blockid_to_m0_n0_block_cluster_adaptor,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
#endif
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
typename AKMGridDesc,
typename BKNGridDesc,
typename CMNGridDesc,
index_t MPerBlockM1,
index_t NPerBlockN1,
index_t KPerBlock,
index_t M1PerThreadM111,
index_t N1PerThreadN111,
index_t KPerThread,
index_t M11N11ThreadClusterM1100,
index_t M11N11ThreadClusterN1100,
index_t M11N11ThreadClusterM1101,
index_t M11N11ThreadClusterN1101,
typename ABlockTransferThreadSliceLengths_K_M0_M1,
typename ABlockTransferThreadClusterLengths_K_M0_M1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_M1,
bool AThreadTransferSrcResetCoordinateAfterRun,
typename BBlockTransferThreadSliceLengths_K_N0_N1,
typename BBlockTransferThreadClusterLengths_K_N0_N1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_N1,
bool BThreadTransferSrcResetCoordinateAfterRun,
typename CThreadTransferSrcDstAccessOrder,
index_t CThreadTransferSrcDstVectorDim,
index_t CThreadTransferDstScalarPerVector,
typename AGridStepHacks,
typename BGridStepHacks,
typename CGridStepHacks,
typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowStepHacks>
struct GridwiseGemmDlops_km_kn_mn_v1r2
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
constexpr auto max_lds_align = math::lcm(Number<ABlockTransferDstScalarPerVector_M1>{},
Number<BBlockTransferDstScalarPerVector_N1>{},
Number<M1PerThreadM111>{},
Number<N1PerThreadN111>{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_aligned_space_size =
math::integer_least_multiple(a_k_m_block_desc.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_aligned_space_size =
math::integer_least_multiple(b_k_n_block_desc.GetElementSpaceSize(), max_lds_align);
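// factor of 2: space for the even/odd LDS double buffers used in Run()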
return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB);
}
__host__ __device__ static constexpr bool CheckValidity(const AKMGridDesc& a_k_m_grid_desc,
const BKNGridDesc& b_k_n_grid_desc,
const CMNGridDesc& c_m_n_grid_desc)
{
const auto M = a_k_m_grid_desc.GetLength(I1);
const auto N = b_k_n_grid_desc.GetLength(I1);
const auto K = a_k_m_grid_desc.GetLength(I0);
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
K == b_k_n_grid_desc.GetLength(I0)) &&
(M % MPerBlockM1 == 0 && N % NPerBlockN1 == 0 && K % KPerBlock == 0);
}
__host__ __device__ static constexpr index_t CalculateGridSize(index_t M, index_t N)
{
const index_t grid_size = (M / MPerBlockM1) * (N / NPerBlockN1);
return grid_size;
}
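// the main loop processes K in pairs of KPerBlock tiles; it is needed only when there are at
// least 3 tiles in total, since the tail below always handles the final 1 or 2 tiles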
__host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
{
const bool has_main_k_block_loop = (K + KPerBlock) / (2 * KPerBlock) > 1;
return has_main_k_block_loop;
}
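// with an even number of K tiles the tail runs two GEMM iterations; with an odd count, one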
__host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t K)
{
const bool has_double_tail_k_block_loop = (K / KPerBlock) % 2 == 0;
return has_double_tail_k_block_loop;
}
__host__ __device__ static constexpr auto
MakeAKM0M1GridDescriptor(const AKMGridDesc& a_k_m_grid_desc)
{
const auto K = a_k_m_grid_desc.GetLength(I0);
const auto M = a_k_m_grid_desc.GetLength(I1);
const auto M1 = Number<MPerBlockM1>{};
const auto M0 = M / M1;
const auto a_k_m0_m1_grid_desc = transform_tensor_descriptor(
a_k_m_grid_desc,
make_tuple(make_pass_through_transform(K), make_unmerge_transform(make_tuple(M0, M1))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}));
return a_k_m0_m1_grid_desc;
}
__host__ __device__ static constexpr auto
MakeBKN0N1GridDescriptor(const BKNGridDesc& b_k_n_grid_desc)
{
const auto K = b_k_n_grid_desc.GetLength(I0);
const auto N = b_k_n_grid_desc.GetLength(I1);
const auto N1 = Number<NPerBlockN1>{};
const auto N0 = N / N1;
const auto b_k_n0_n1_grid_desc = transform_tensor_descriptor(
b_k_n_grid_desc,
make_tuple(make_pass_through_transform(K), make_unmerge_transform(make_tuple(N0, N1))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}));
return b_k_n0_n1_grid_desc;
}
__host__ __device__ static constexpr auto
MakeCM0M10M11N0N10N11GridDescriptor(const CMNGridDesc& c_m_n_grid_desc)
{
const auto M = c_m_n_grid_desc.GetLength(I0);
const auto N = c_m_n_grid_desc.GetLength(I1);
constexpr auto M1 = Number<MPerBlockM1>{};
constexpr auto N1 = Number<NPerBlockN1>{};
const auto M0 = M / M1;
const auto N0 = N / N1;
constexpr auto M11 =
Number<M11N11ThreadClusterM1100 * M11N11ThreadClusterM1101 * M1PerThreadM111>{};
constexpr auto N11 =
Number<M11N11ThreadClusterN1100 * M11N11ThreadClusterN1101 * N1PerThreadN111>{};
constexpr auto M10 = M1 / M11;
constexpr auto N10 = N1 / N11;
const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_tensor_descriptor(
c_m_n_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)),
make_unmerge_transform(make_tuple(N0, N10, N11))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{}));
return c_m0_m10_m11_n0_n10_n11_grid_desc;
}
__host__ __device__ static constexpr auto
MakeCBlockIdToM0N0BlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc)
{
const auto M = c_m_n_grid_desc.GetLength(I0);
const auto N = c_m_n_grid_desc.GetLength(I1);
constexpr auto M1 = Number<MPerBlockM1>{};
constexpr auto N1 = Number<NPerBlockN1>{};
const auto M0 = M / M1;
const auto N0 = N / N1;
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(M0, N0))),
make_tuple(Sequence<0, 1>{}),
make_tuple(Sequence<0>{}));
return c_blockid_to_m0_n0_block_cluster_adaptor;
}
using AKM0M1GridDesc = decltype(MakeAKM0M1GridDescriptor(AKMGridDesc{}));
using BKN0N1GridDesc = decltype(MakeBKN0N1GridDescriptor(BKNGridDesc{}));
using CM0M10M11N0N10N11GridDesc = decltype(MakeCM0M10M11N0N10N11GridDescriptor(CMNGridDesc{}));
using CBlockIdToM0N0BlockClusterAdaptor =
decltype(MakeCBlockIdToM0N0BlockClusterAdaptor(CMNGridDesc{}));
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ static void
Run(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
FloatAB* __restrict__ p_shared_block,
const AKM0M1GridDesc& a_k_m0_m1_grid_desc,
const BKN0N1GridDesc& b_k_n0_n1_grid_desc,
const CM0M10M11N0N10N11GridDesc& c_m0_m10_m11_n0_n10_n11_grid_desc,
const CBlockIdToM0N0BlockClusterAdaptor& c_blockid_to_m0_n0_block_cluster_adaptor,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>)
{
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_a_grid, a_k_m0_m1_grid_desc.GetElementSpaceSize());
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_b_grid, b_k_n0_n1_grid_desc.GetElementSpaceSize());
auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_c_grid, c_m0_m10_m11_n0_n10_n11_grid_desc.GetElementSpaceSize());
const auto K = a_k_m0_m1_grid_desc.GetLength(I0);
// divide block work by [M, N]
const auto c_m0_n0_block_cluster_idx =
c_blockid_to_m0_n0_block_cluster_adaptor.CalculateBottomIndex(
make_multi_index(get_block_1d_id()));
// HACK: this forces index data into SGPR
const index_t im0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I0]);
const index_t in0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I1]);
// lds max alignment
constexpr auto max_lds_align = math::lcm(Number<ABlockTransferDstScalarPerVector_M1>{},
Number<BBlockTransferDstScalarPerVector_N1>{},
Number<M1PerThreadM111>{},
Number<N1PerThreadN111>{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}), max_lds_align);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}), max_lds_align);
// A matrix blockwise copy
auto a_blockwise_copy =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<KPerBlock, 1, MPerBlockM1>,
ABlockTransferThreadSliceLengths_K_M0_M1,
ABlockTransferThreadClusterLengths_K_M0_M1,
ABlockTransferThreadClusterArrangeOrder,
FloatAB,
FloatAB,
decltype(a_k_m0_m1_grid_desc),
decltype(a_k_m0_m1_block_desc),
ABlockTransferSrcAccessOrder,
Sequence<0, 1, 2>,
ABlockTransferSrcVectorDim,
2,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_M1,
1,
1,
AThreadTransferSrcResetCoordinateAfterRun,
true>(a_k_m0_m1_grid_desc,
make_multi_index(0, im0, 0),
a_k_m0_m1_block_desc,
make_multi_index(0, 0, 0));
// B matrix blockwise copy
auto b_blockwise_copy =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<KPerBlock, 1, NPerBlockN1>,
BBlockTransferThreadSliceLengths_K_N0_N1,
BBlockTransferThreadClusterLengths_K_N0_N1,
BBlockTransferThreadClusterArrangeOrder,
FloatAB,
FloatAB,
decltype(b_k_n0_n1_grid_desc),
decltype(b_k_n0_n1_block_desc),
BBlockTransferSrcAccessOrder,
Sequence<0, 1, 2>,
BBlockTransferSrcVectorDim,
2,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N1,
1,
1,
BThreadTransferSrcResetCoordinateAfterRun,
true>(b_k_n0_n1_grid_desc,
make_multi_index(0, in0, 0),
b_k_n0_n1_block_desc,
make_multi_index(0, 0, 0));
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx[KPerBlock, MPerBlockM1] is in LDS
// b_mtx[KPerBlock, NPerBlockN1] is in LDS
// c_mtx[MPerBlockM1, NPerBlockN1] is distributed among threads, and saved in
// register
const auto blockwise_gemm =
BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2<BlockSize,
FloatAB,
FloatAB,
FloatAcc,
decltype(a_k_m_block_desc),
decltype(b_k_n_block_desc),
M1PerThreadM111,
N1PerThreadN111,
KPerThread,
M11N11ThreadClusterM1100,
M11N11ThreadClusterN1100,
M11N11ThreadClusterM1101,
M11N11ThreadClusterN1101,
M1PerThreadM111,
N1PerThreadN111>{};
constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths =
decltype(blockwise_gemm)::GetCM0M1N0N1ThreadTensorLengths();
constexpr auto c_m10_m11_n10_n11_thread_desc = make_naive_tensor_descriptor_packed(
sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths));
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_aligned_space_size =
math::integer_least_multiple(a_k_m0_m1_block_desc.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_aligned_space_size =
math::integer_least_multiple(b_k_n0_n1_block_desc.GetElementSpaceSize(), max_lds_align);
FloatAB* p_a_block_double = p_shared_block;
FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size;
// register allocation for output
auto c_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAcc>(
c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize());
ThreadwiseTensorSliceSet_v1<FloatAcc,
decltype(c_m10_m11_n10_n11_thread_desc),
decltype(c_m10_m11_n10_n11_thread_tensor_lengths)>{}
.Run(c_m10_m11_n10_n11_thread_desc,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
FloatAcc{0});
constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
// hack to control index calculation when iterating over A and B matrix for threadwise copy
constexpr auto a_k_m0_m1_global_step_hacks = AGridStepHacks{};
constexpr auto b_k_n0_n1_global_step_hacks = BGridStepHacks{};
// hack to control index calculation when move slice window for A and B matrix for
// threadwise copy
constexpr auto a_k_m0_m1_global_move_slice_window_step_hack =
AGridMoveSliceWindowStepHacks{};
constexpr auto b_k_n0_n1_global_move_slice_window_step_hack =
BGridMoveSliceWindowStepHacks{};
auto a_block_even_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_a_block_double, a_k_m0_m1_block_desc.GetElementSpaceSize());
auto b_block_even_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_b_block_double, b_k_n0_n1_block_desc.GetElementSpaceSize());
auto a_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_a_block_double + a_block_aligned_space_size,
a_k_m0_m1_block_desc.GetElementSpaceSize());
auto b_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_b_block_double + b_block_aligned_space_size,
b_k_n0_n1_block_desc.GetElementSpaceSize());
// LDS double buffer: preload data into LDS
{
a_blockwise_copy.RunRead(
a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_step_hacks);
b_blockwise_copy.RunRead(
b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_step_hacks);
a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_even_buf);
b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_even_buf);
}
if constexpr(HasMainKBlockLoop)
{
index_t k_block_data_begin = 0;
// LDS double buffer: main body
// use Do-While loop instead of For loop to simplify control flow
do
{
// even iteration
a_blockwise_copy.MoveSrcSliceWindow(a_k_m0_m1_grid_desc,
a_block_slice_copy_step,
a_k_m0_m1_global_move_slice_window_step_hack);
b_blockwise_copy.MoveSrcSliceWindow(b_k_n0_n1_grid_desc,
b_block_slice_copy_step,
b_k_n0_n1_global_move_slice_window_step_hack);
__syncthreads();
// LDS double buffer: load next data from device mem
a_blockwise_copy.RunRead(
a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_step_hacks);
b_blockwise_copy.RunRead(
b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_step_hacks);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(c_m10_m11_n10_n11_thread_desc,
a_block_even_buf,
b_block_even_buf,
c_thread_buf);
// LDS double buffer: store next data to LDS
a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_odd_buf);
b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_odd_buf);
// odd iteration
a_blockwise_copy.MoveSrcSliceWindow(a_k_m0_m1_grid_desc,
a_block_slice_copy_step,
a_k_m0_m1_global_move_slice_window_step_hack);
b_blockwise_copy.MoveSrcSliceWindow(b_k_n0_n1_grid_desc,
b_block_slice_copy_step,
b_k_n0_n1_global_move_slice_window_step_hack);
__syncthreads();
// LDS double buffer: load next data from device mem
a_blockwise_copy.RunRead(
a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_step_hacks);
b_blockwise_copy.RunRead(
b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_step_hacks);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(
c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf);
// LDS double buffer: store next data to LDS
a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_even_buf);
b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_even_buf);
k_block_data_begin += 2 * KPerBlock;
} while(k_block_data_begin < K - 2 * KPerBlock);
}
// LDS double buffer: tail
if constexpr(HasDoubleTailKBlockLoop) // if there are 2 iterations left
{
a_blockwise_copy.MoveSrcSliceWindow(a_k_m0_m1_grid_desc,
a_block_slice_copy_step,
a_k_m0_m1_global_move_slice_window_step_hack);
b_blockwise_copy.MoveSrcSliceWindow(b_k_n0_n1_grid_desc,
b_block_slice_copy_step,
b_k_n0_n1_global_move_slice_window_step_hack);
__syncthreads();
// LDS double buffer: load last data from device mem
a_blockwise_copy.RunRead(
a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_step_hacks);
b_blockwise_copy.RunRead(
b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_step_hacks);
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm.Run(
c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf);
// LDS double buffer: store last data to LDS
a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_odd_buf);
b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_odd_buf);
__syncthreads();
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(
c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf);
}
else // if there is 1 iteration left
{
__syncthreads();
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(
c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf);
}
// output: register to global memory
{
constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc =
make_naive_tensor_descriptor_packed(
make_tuple(I1,
Number<c_m10_m11_n10_n11_thread_tensor_lengths[I0]>{},
Number<c_m10_m11_n10_n11_thread_tensor_lengths[I1]>{},
I1,
Number<c_m10_m11_n10_n11_thread_tensor_lengths[I2]>{},
Number<c_m10_m11_n10_n11_thread_tensor_lengths[I3]>{}));
const auto c_m10_m11_n10_n11_thread_origin_idx_on_block =
blockwise_gemm.CalculateCM0M1N0N1ThreadOriginOnBlock(get_thread_local_1d_id());
ThreadwiseTensorSliceTransfer_v1r3<
FloatAcc,
FloatC,
decltype(c_m0_m10_m11_n0_n10_n11_thread_desc),
decltype(c_m0_m10_m11_n0_n10_n11_grid_desc),
Sequence<1,
c_m10_m11_n10_n11_thread_tensor_lengths[I0],
c_m10_m11_n10_n11_thread_tensor_lengths[I1],
1,
c_m10_m11_n10_n11_thread_tensor_lengths[I2],
c_m10_m11_n10_n11_thread_tensor_lengths[I3]>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
CGlobalMemoryDataOperation,
1,
true>{c_m0_m10_m11_n0_n10_n11_grid_desc,
make_multi_index(im0,
c_m10_m11_n10_n11_thread_origin_idx_on_block[I0],
c_m10_m11_n10_n11_thread_origin_idx_on_block[I1],
in0,
c_m10_m11_n10_n11_thread_origin_idx_on_block[I2],
c_m10_m11_n10_n11_thread_origin_idx_on_block[I3])}
.Run(c_m0_m10_m11_n0_n10_n11_thread_desc,
make_tuple(I0, I0, I0, I0, I0, I0),
c_thread_buf,
c_m0_m10_m11_n0_n10_n11_grid_desc,
c_grid_buf,
CGridStepHacks{});
}
}
};
} // namespace ck
#endif
#ifndef CK_GRIDWISE_GEMM_V1R3_HPP
#define CK_GRIDWISE_GEMM_V1R3_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_gemm_dlops_v2r3.hpp"
#include "blockwise_tensor_slice_transfer_v2.hpp"
#include "threadwise_tensor_slice_transfer_v2.hpp"
#include "threadwise_tensor_slice_set.hpp"
namespace ck {
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AK0M0M1K1GridDesc,
typename BK0N0N1K1GridDesc,
typename CM0M10M11N0N10N11GridDesc,
typename CBlockIdToM0N0BlockClusterAdaptor,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v1r3(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const AK0M0M1K1GridDesc a_k0_m0_m1_k1_grid_desc,
const BK0N0N1K1GridDesc b_k0_n0_n1_k1_grid_desc,
const CM0M10M11N0N10N11GridDesc c_m0_m10_m11_n0_n10_n11_grid_desc,
const CBlockIdToM0N0BlockClusterAdaptor c_blockid_to_m0_n0_block_cluster_adaptor)
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k0_m0_m1_k1_grid_desc,
b_k0_n0_n1_k1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
c_blockid_to_m0_n0_block_cluster_adaptor,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
// pass tensor descriptor by CONSTANT void pointer
// CONSTANT is needed to inform the compiler that the void pointers in the kernel signature point to
// the non-modifiable parameter address space, so the compiler can enable the corresponding optimization
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AK0M0M1K1GridDesc,
typename BK0N0N1K1GridDesc,
typename CM0M10M11N0N10N11GridDesc,
typename CBlockIdToM0N0BlockClusterAdaptor,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v1r3(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_k0_m0_m1_k1_grid_desc,
const void CONSTANT* p_b_k0_n0_n1_k1_grid_desc,
const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc,
const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor)
{
// first cast the CONSTANT void* to a generic void*,
// then cast the void* to the descriptor type;
// the tensor descriptor's copy constructor does not accept an address_space(4) pointer
const auto a_k0_m0_m1_k1_grid_desc = *reinterpret_cast<const AK0M0M1K1GridDesc*>(
cast_pointer_to_generic_address_space(p_a_k0_m0_m1_k1_grid_desc));
const auto b_k0_n0_n1_k1_grid_desc = *reinterpret_cast<const BK0N0N1K1GridDesc*>(
cast_pointer_to_generic_address_space(p_b_k0_n0_n1_k1_grid_desc));
const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
*reinterpret_cast<const CM0M10M11N0N10N11GridDesc*>(
cast_pointer_to_generic_address_space(p_c_m0_m10_m11_n0_n10_n11_grid_desc));
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
cast_pointer_to_generic_address_space(p_c_blockid_to_m0_n0_block_cluster_adaptor));
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k0_m0_m1_k1_grid_desc,
b_k0_n0_n1_k1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
c_blockid_to_m0_n0_block_cluster_adaptor,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
#endif
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
typename AK0MK1GridDesc,
typename BK0NK1GridDesc,
typename CMNGridDesc,
index_t MPerBlockM1,
index_t NPerBlockN1,
index_t KPerBlock,
index_t M1PerThreadM111,
index_t N1PerThreadN111,
index_t KPerThread,
typename M11N11ThreadClusterM110Xs,
typename M11N11ThreadClusterN110Xs,
typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
typename ABlockTransferSrcVectorTensorContiguousDimOrder,
typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
typename BBlockTransferSrcVectorTensorContiguousDimOrder,
typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
typename CThreadTransferSrcDstAccessOrder,
index_t CThreadTransferSrcDstVectorDim,
index_t CThreadTransferDstScalarPerVector,
typename AGridStepHacks,
typename BGridStepHacks,
typename CGridStepHacks,
typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowStepHacks>
struct GridwiseGemmDlops_km_kn_mn_v1r3
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
// K1 should be Number<...>
static constexpr auto K1 = AK0MK1GridDesc{}.GetLength(I2);
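// reading the length from a default-constructed descriptor only yields the real value when that
// length is a compile-time Number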
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
// TODO: change this. I think it needs multi-dimensional alignment
constexpr auto max_lds_align = K1;
// TODO: check alignment
// A matrix in LDS memory, dst of blockwise copy
constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}, K1), max_lds_align);
// TODO: check alignment
// B matrix in LDS memory, dst of blockwise copy
constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}, K1), max_lds_align);
// TODO: check alignment
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_aligned_space_size =
math::integer_least_multiple(a_k_m_block_desc.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_aligned_space_size =
math::integer_least_multiple(b_k_n_block_desc.GetElementSpaceSize(), max_lds_align);
return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB);
}
__host__ __device__ static constexpr bool
CheckValidity(const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
const CMNGridDesc& c_m_n_grid_desc)
{
const auto M = a_k0_m_k1_grid_desc.GetLength(I1);
const auto N = b_k0_n_k1_grid_desc.GetLength(I1);
const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0);
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
K0 == b_k0_n_k1_grid_desc.GetLength(I0) &&
K1 == a_k0_m_k1_grid_desc.GetLength(I2) &&
K1 == b_k0_n_k1_grid_desc.GetLength(I2)) &&
(M % MPerBlockM1 == 0 && N % NPerBlockN1 == 0 && K0 % KPerBlock == 0);
}
__host__ __device__ static constexpr index_t CalculateGridSize(index_t M, index_t N)
{
const index_t grid_size = (M / MPerBlockM1) * (N / NPerBlockN1);
return grid_size;
}
__host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K0)
{
const bool has_main_k_block_loop = (K0 + KPerBlock) / (2 * KPerBlock) > 1;
return has_main_k_block_loop;
}
__host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t K0)
{
const bool has_double_tail_k_block_loop = (K0 / KPerBlock) % 2 == 0;
return has_double_tail_k_block_loop;
}
__host__ __device__ static constexpr auto
MakeAK0M0M1K1GridDescriptor(const AK0MK1GridDesc& a_k0_m_k1_grid_desc)
{
const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0);
const auto M = a_k0_m_k1_grid_desc.GetLength(I1);
const auto M1 = Number<MPerBlockM1>{};
const auto M0 = M / M1;
const auto a_k0_m0_m1_k1_grid_desc =
transform_tensor_descriptor(a_k0_m_k1_grid_desc,
make_tuple(make_pass_through_transform(K0),
make_unmerge_transform(make_tuple(M0, M1)),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
return a_k0_m0_m1_k1_grid_desc;
}
__host__ __device__ static constexpr auto
MakeBK0N0N1K1GridDescriptor(const BK0NK1GridDesc& b_k0_n_k1_grid_desc)
{
const auto K0 = b_k0_n_k1_grid_desc.GetLength(I0);
const auto N = b_k0_n_k1_grid_desc.GetLength(I1);
const auto N1 = Number<NPerBlockN1>{};
const auto N0 = N / N1;
const auto b_k0_n0_n1_k1_grid_desc =
transform_tensor_descriptor(b_k0_n_k1_grid_desc,
make_tuple(make_pass_through_transform(K0),
make_unmerge_transform(make_tuple(N0, N1)),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
return b_k0_n0_n1_k1_grid_desc;
}
__host__ __device__ static constexpr auto
MakeCM0M10M11N0N10N11GridDescriptor(const CMNGridDesc& c_m_n_grid_desc)
{
const auto M = c_m_n_grid_desc.GetLength(I0);
const auto N = c_m_n_grid_desc.GetLength(I1);
constexpr auto M1 = Number<MPerBlockM1>{};
constexpr auto N1 = Number<NPerBlockN1>{};
const auto M0 = M / M1;
const auto N0 = N / N1;
constexpr auto M11 =
Number<container_reduce(M11N11ThreadClusterM110Xs{}, math::multiplies{}, I1) *
M1PerThreadM111>{};
constexpr auto N11 =
Number<container_reduce(M11N11ThreadClusterN110Xs{}, math::multiplies{}, I1) *
N1PerThreadN111>{};
constexpr auto M10 = M1 / M11;
constexpr auto N10 = N1 / N11;
const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_tensor_descriptor(
c_m_n_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)),
make_unmerge_transform(make_tuple(N0, N10, N11))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{}));
return c_m0_m10_m11_n0_n10_n11_grid_desc;
}
__host__ __device__ static constexpr auto
MakeCBlockIdToM0N0BlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc)
{
const auto M = c_m_n_grid_desc.GetLength(I0);
const auto N = c_m_n_grid_desc.GetLength(I1);
constexpr auto M1 = Number<MPerBlockM1>{};
constexpr auto N1 = Number<NPerBlockN1>{};
const auto M0 = M / M1;
const auto N0 = N / N1;
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(M0, N0))),
make_tuple(Sequence<0, 1>{}),
make_tuple(Sequence<0>{}));
return c_blockid_to_m0_n0_block_cluster_adaptor;
}
using AK0M0M1K1GridDesc = decltype(MakeAK0M0M1K1GridDescriptor(AK0MK1GridDesc{}));
using BK0N0N1K1GridDesc = decltype(MakeBK0N0N1K1GridDescriptor(BK0NK1GridDesc{}));
using CM0M10M11N0N10N11GridDesc = decltype(MakeCM0M10M11N0N10N11GridDescriptor(CMNGridDesc{}));
using CBlockIdToM0N0BlockClusterAdaptor =
decltype(MakeCBlockIdToM0N0BlockClusterAdaptor(CMNGridDesc{}));
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ static void
Run(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
FloatAB* __restrict__ p_shared_block,
const AK0M0M1K1GridDesc& a_k0_m0_m1_k1_grid_desc,
const BK0N0N1K1GridDesc& b_k0_n0_n1_k1_grid_desc,
const CM0M10M11N0N10N11GridDesc& c_m0_m10_m11_n0_n10_n11_grid_desc,
const CBlockIdToM0N0BlockClusterAdaptor& c_blockid_to_m0_n0_block_cluster_adaptor,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>)
{
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_a_grid, a_k0_m0_m1_k1_grid_desc.GetElementSpaceSize());
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_b_grid, b_k0_n0_n1_k1_grid_desc.GetElementSpaceSize());
auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_c_grid, c_m0_m10_m11_n0_n10_n11_grid_desc.GetElementSpaceSize());
// divide block work by [M, N]
const auto c_m0_n0_block_cluster_idx =
c_blockid_to_m0_n0_block_cluster_adaptor.CalculateBottomIndex(
make_multi_index(get_block_1d_id()));
// HACK: this forces index data into SGPR
const index_t im0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I0]);
const index_t in0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I1]);
// TODO: change this. I think it needs multi-dimensional alignment
constexpr auto max_lds_align = K1;
// TODO: check alignment
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_k0_m0_m1_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}, K1), max_lds_align);
// TODO: check alignment
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_k0_n0_n1_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}, K1), max_lds_align);
// TODO: check alignment
// A matrix in LDS memory, for blockwise GEMM
constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}, K1), max_lds_align);
// TODO: check alignment
// B matrix in LDS memory, for blockwise GEMM
constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}, K1), max_lds_align);
static_assert(a_k0_m0_m1_k1_block_desc.GetElementSpaceSize() ==
a_k0_m_k1_block_desc.GetElementSpaceSize() &&
b_k0_n0_n1_k1_block_desc.GetElementSpaceSize() ==
b_k0_n_k1_block_desc.GetElementSpaceSize(),
"wrong!");
// A matrix blockwise copy
auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1<
BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<KPerBlock, 1, MPerBlockM1, K1.value>,
ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
ABlockTransferThreadClusterArrangeOrder,
FloatAB,
FloatAB,
decltype(a_k0_m0_m1_k1_grid_desc),
decltype(a_k0_m0_m1_k1_block_desc),
ABlockTransferSrcAccessOrder,
Sequence<0, 1, 2, 3>,
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, // SrcVectorTensorLengths
ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, // DstVectorTensorLengths
ABlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder
Sequence<0, 1, 2, 3>, // DstVectorTensorContiguousDimOrder
false,
true>(a_k0_m0_m1_k1_grid_desc,
make_multi_index(0, im0, 0, 0),
a_k0_m0_m1_k1_block_desc,
make_multi_index(0, 0, 0, 0));
// B matrix blockwise copy
auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1<
BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<KPerBlock, 1, NPerBlockN1, K1.value>,
BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
BBlockTransferThreadClusterArrangeOrder,
FloatAB,
FloatAB,
decltype(b_k0_n0_n1_k1_grid_desc),
decltype(b_k0_n0_n1_k1_block_desc),
BBlockTransferSrcAccessOrder,
Sequence<0, 1, 2, 3>,
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, // SrcVectorTensorLengths
BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, // DstVectorTensorLengths
BBlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder
Sequence<0, 1, 2, 3>, // DstVectorTensorContiguousDimOrder
false,
true>(b_k0_n0_n1_k1_grid_desc,
make_multi_index(0, in0, 0, 0),
b_k0_n0_n1_k1_block_desc,
make_multi_index(0, 0, 0, 0));
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx[KPerBlock, MPerBlockM1] is in LDS
// b_mtx[KPerBlock, NPerBlockN1] is in LDS
// c_mtx[MPerBlockM1, NPerBlockN1] is distributed among threads, and saved in
// register
const auto blockwise_gemm =
BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2<
BlockSize,
FloatAB,
FloatAB,
FloatAcc,
decltype(a_k0_m_k1_block_desc),
decltype(b_k0_n_k1_block_desc),
M1PerThreadM111,
N1PerThreadN111,
KPerThread,
M11N11ThreadClusterM110Xs,
M11N11ThreadClusterN110Xs,
M1PerThreadM111,
N1PerThreadN111>{};
constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths =
decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1();
constexpr auto c_m10_m11_n10_n11_thread_desc = make_naive_tensor_descriptor_packed(
sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths));
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_aligned_space_size = math::integer_least_multiple(
a_k0_m0_m1_k1_block_desc.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_aligned_space_size = math::integer_least_multiple(
b_k0_n0_n1_k1_block_desc.GetElementSpaceSize(), max_lds_align);
FloatAB* p_a_block_double = p_shared_block;
FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size;
// register allocation for output
auto c_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAcc>(
c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize());
ThreadwiseTensorSliceSet_v1<FloatAcc,
decltype(c_m10_m11_n10_n11_thread_desc),
decltype(c_m10_m11_n10_n11_thread_tensor_lengths)>{}
.Run(c_m10_m11_n10_n11_thread_desc,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
FloatAcc{0});
constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0, 0);
constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0, 0);
auto a_block_even_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_a_block_double, a_k0_m0_m1_k1_block_desc.GetElementSpaceSize());
auto b_block_even_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_b_block_double, b_k0_n0_n1_k1_block_desc.GetElementSpaceSize());
auto a_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_a_block_double + a_block_aligned_space_size,
a_k0_m0_m1_k1_block_desc.GetElementSpaceSize());
auto b_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_b_block_double + b_block_aligned_space_size,
b_k0_n0_n1_k1_block_desc.GetElementSpaceSize());
// LDS double buffer: preload data into LDS
{
a_blockwise_copy.RunRead(a_k0_m0_m1_k1_grid_desc, a_global_buf, AGridStepHacks{});
b_blockwise_copy.RunRead(b_k0_n0_n1_k1_grid_desc, b_global_buf, BGridStepHacks{});
a_blockwise_copy.RunWrite(a_k0_m0_m1_k1_block_desc, a_block_even_buf);
b_blockwise_copy.RunWrite(b_k0_n0_n1_k1_block_desc, b_block_even_buf);
}
if constexpr(HasMainKBlockLoop)
{
const auto K0 = a_k0_m0_m1_k1_grid_desc.GetLength(I0);
index_t k_block_data_begin = 0;
// LDS double buffer: main body
// use Do-While loop instead of For loop to simplify control flow
do
{
// even iteration
a_blockwise_copy.MoveSrcSliceWindow(a_k0_m0_m1_k1_grid_desc,
a_block_slice_copy_step,
AGridMoveSliceWindowStepHacks{});
b_blockwise_copy.MoveSrcSliceWindow(b_k0_n0_n1_k1_grid_desc,
b_block_slice_copy_step,
BGridMoveSliceWindowStepHacks{});
__syncthreads();
// LDS double buffer: load next data from device mem
a_blockwise_copy.RunRead(a_k0_m0_m1_k1_grid_desc, a_global_buf, AGridStepHacks{});
b_blockwise_copy.RunRead(b_k0_n0_n1_k1_grid_desc, b_global_buf, BGridStepHacks{});
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(c_m10_m11_n10_n11_thread_desc,
a_block_even_buf,
b_block_even_buf,
c_thread_buf);
// LDS double buffer: store next data to LDS
a_blockwise_copy.RunWrite(a_k0_m0_m1_k1_block_desc, a_block_odd_buf);
b_blockwise_copy.RunWrite(b_k0_n0_n1_k1_block_desc, b_block_odd_buf);
// odd iteration
a_blockwise_copy.MoveSrcSliceWindow(a_k0_m0_m1_k1_grid_desc,
a_block_slice_copy_step,
AGridMoveSliceWindowStepHacks{});
b_blockwise_copy.MoveSrcSliceWindow(b_k0_n0_n1_k1_grid_desc,
b_block_slice_copy_step,
BGridMoveSliceWindowStepHacks{});
__syncthreads();
// LDS double buffer: load next data from device mem
a_blockwise_copy.RunRead(a_k0_m0_m1_k1_grid_desc, a_global_buf, AGridStepHacks{});
b_blockwise_copy.RunRead(b_k0_n0_n1_k1_grid_desc, b_global_buf, BGridStepHacks{});
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(
c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf);
// LDS double buffer: store next data to LDS
a_blockwise_copy.RunWrite(a_k0_m0_m1_k1_block_desc, a_block_even_buf);
b_blockwise_copy.RunWrite(b_k0_n0_n1_k1_block_desc, b_block_even_buf);
k_block_data_begin += 2 * KPerBlock;
} while(k_block_data_begin < K0 - 2 * KPerBlock);
}
// LDS double buffer: tail
if constexpr(HasDoubleTailKBlockLoop) // if there are 2 iterations left
{
a_blockwise_copy.MoveSrcSliceWindow(
a_k0_m0_m1_k1_grid_desc, a_block_slice_copy_step, AGridMoveSliceWindowStepHacks{});
b_blockwise_copy.MoveSrcSliceWindow(
b_k0_n0_n1_k1_grid_desc, b_block_slice_copy_step, BGridMoveSliceWindowStepHacks{});
__syncthreads();
// LDS double buffer: load last data from device mem
a_blockwise_copy.RunRead(a_k0_m0_m1_k1_grid_desc, a_global_buf, AGridStepHacks{});
b_blockwise_copy.RunRead(b_k0_n0_n1_k1_grid_desc, b_global_buf, BGridStepHacks{});
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm.Run(
c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf);
// LDS double buffer: store last data to LDS
a_blockwise_copy.RunWrite(a_k0_m0_m1_k1_block_desc, a_block_odd_buf);
b_blockwise_copy.RunWrite(b_k0_n0_n1_k1_block_desc, b_block_odd_buf);
__syncthreads();
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(
c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf);
}
else // if there is 1 iteration left
{
__syncthreads();
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(
c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf);
}
// output: register to global memory
{
constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc =
make_naive_tensor_descriptor_packed(
make_tuple(I1,
Number<c_m10_m11_n10_n11_thread_tensor_lengths[I0]>{},
Number<c_m10_m11_n10_n11_thread_tensor_lengths[I1]>{},
I1,
Number<c_m10_m11_n10_n11_thread_tensor_lengths[I2]>{},
Number<c_m10_m11_n10_n11_thread_tensor_lengths[I3]>{}));
const auto c_m10_m11_n10_n11_thread_origin_idx_on_block =
blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
get_thread_local_1d_id());
ThreadwiseTensorSliceTransfer_v1r3<
FloatAcc,
FloatC,
decltype(c_m0_m10_m11_n0_n10_n11_thread_desc),
decltype(c_m0_m10_m11_n0_n10_n11_grid_desc),
Sequence<1,
c_m10_m11_n10_n11_thread_tensor_lengths[I0],
c_m10_m11_n10_n11_thread_tensor_lengths[I1],
1,
c_m10_m11_n10_n11_thread_tensor_lengths[I2],
c_m10_m11_n10_n11_thread_tensor_lengths[I3]>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
CGlobalMemoryDataOperation,
1,
true>{c_m0_m10_m11_n0_n10_n11_grid_desc,
make_multi_index(im0,
c_m10_m11_n10_n11_thread_origin_idx_on_block[I0],
c_m10_m11_n10_n11_thread_origin_idx_on_block[I1],
in0,
c_m10_m11_n10_n11_thread_origin_idx_on_block[I2],
c_m10_m11_n10_n11_thread_origin_idx_on_block[I3])}
.Run(c_m0_m10_m11_n0_n10_n11_thread_desc,
make_tuple(I0, I0, I0, I0, I0, I0),
c_thread_buf,
c_m0_m10_m11_n0_n10_n11_grid_desc,
c_grid_buf,
CGridStepHacks{});
}
}
};
} // namespace ck
#endif
#ifndef CK_GRIDWISE_GEMM_V2_HPP
#define CK_GRIDWISE_GEMM_V2_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "blockwise_gemm_dlops_v3.hpp"
namespace ck {
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
typename AGlobalDesc,
typename BGlobalDesc,
typename CGlobalDesc,
index_t KPerBlock,
index_t HoPerBlock,
index_t WoPerBlock,
index_t EPerBlock,
index_t KPerThread,
index_t HoPerThread,
index_t WoPerThread,
index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E_K,
typename ABlockTransferThreadClusterLengths_E_K,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_K,
bool AThreadTransferSrcResetCoordinateAfterRun,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
bool BThreadTransferSrcResetCoordinateAfterRun,
typename CThreadTransferSrcDstAccessOrder,
index_t CThreadTransferSrcDstVectorDim,
index_t CThreadTransferDstScalarPerVector,
typename AGlobalStepHacks,
typename BGlobalStepHacks,
typename CGlobalStepHacks,
typename AGlobalMoveSliceWindowStepHacks,
typename BGlobalMoveSliceWindowStepHacks>
struct GridwiseGemmDlops_km_kn_mn_v3
{
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
constexpr auto E = EPerBlock * 3 * 3;
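// NOTE: the hard-coded 3 * 3 factor presumably corresponds to a 3x3 filter window,
// i.e. E = EPerBlock * Y * X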
constexpr auto max_lds_align =
math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
// LDS allocation for A: be careful of alignment
constexpr auto a_block_space_size =
math::integer_least_multiple(a_e_k_desc.GetElementSpaceSize(), max_lds_align);
return a_block_space_size * sizeof(FloatAB);
}
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const AGlobalDesc& a_e_k_global_desc,
const FloatAB* __restrict__ p_a_global,
const BGlobalDesc& b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc& c_k_n_ho_wo_global_desc,
FloatC* __restrict__ p_c_global,
FloatAB* __restrict__ p_shared_block,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_a_global, a_e_k_global_desc.GetElementSpaceSize());
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_b_global, b_e_n_ho_wo_global_desc.GetElementSpaceSize());
auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize());
constexpr auto E = EPerBlock * 3 * 3;
// const auto E = a_e_k_global_desc.GetLength(I0);
const auto K = a_e_k_global_desc.GetLength(I1);
const auto N = b_e_n_ho_wo_global_desc.GetLength(I1);
const auto Ho = b_e_n_ho_wo_global_desc.GetLength(I2);
const auto Wo = b_e_n_ho_wo_global_desc.GetLength(I3);
// divide block work by [M, N]
#if 0
const auto ho_block_work_num = Ho / Number<HoPerBlock>{};
const auto wo_block_work_num = Wo / Number<WoPerBlock>{};
const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
const index_t k_block_work_id = get_block_1d_id() / hwo_block_work_num;
const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
const index_t ho_block_work_id = hwo_block_work_id / wo_block_work_num;
const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
#else
// Hack: this forces the result into SGPR
const index_t ho_block_work_num = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock);
const index_t wo_block_work_num = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock);
const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num;
const index_t k_block_work_id =
__builtin_amdgcn_readfirstlane(get_block_1d_id() / hwo_block_work_num);
const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
const index_t ho_block_work_id =
__builtin_amdgcn_readfirstlane(hwo_block_work_id / wo_block_work_num);
const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
#endif
// lds max alignment
constexpr auto max_lds_align =
math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}), max_lds_align);
constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_e_n_ho_wo_block_desc = make_naive_tensor_descriptor_packed(make_tuple(
Number<EPerBlock>{}, Number<1>{}, Number<HoPerBlock>{}, Number<WoPerBlock>{}));
// c_thread_mtx definition: this is a mess
// TODO: find a more elegant way of defining c_thread_mtx
constexpr auto c_k_n_ho_wo_thread_desc = make_naive_tensor_descriptor_packed(make_tuple(
Number<KPerThread>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
auto blockwise_gemm =
BlockwiseGemmDlops_km_kn_m0m1n0n1_v3<BlockSize,
FloatAB,
FloatAB,
FloatAcc,
decltype(a_e_k_block_desc),
decltype(b_e_n_ho_wo_block_desc),
decltype(c_k_n_ho_wo_thread_desc),
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K>{};
auto c_thread_mtx_index = blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
const auto k_thread_id = c_thread_mtx_index.k;
const auto ho_thread_id = c_thread_mtx_index.h;
const auto wo_thread_id = c_thread_mtx_index.w;
const index_t k_block_data_on_global = k_block_work_id * KPerBlock;
const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock;
const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock;
const index_t ho_thread_data_on_global =
ho_block_data_on_global + ho_thread_id * HoPerThread;
const index_t wo_thread_data_on_global =
wo_block_data_on_global + wo_thread_id * WoPerThread;
// A matrix blockwise copy
auto a_blockwise_copy =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<E, KPerBlock>,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
ABlockTransferThreadClusterArrangeOrder,
FloatAB,
FloatAB,
decltype(a_e_k_global_desc),
decltype(a_e_k_desc),
ABlockTransferSrcAccessOrder,
Sequence<0, 1>,
ABlockTransferSrcVectorDim,
1,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K,
1,
1,
AThreadTransferSrcResetCoordinateAfterRun,
true>(a_e_k_global_desc,
make_multi_index(0, k_block_data_on_global),
a_e_k_desc,
make_multi_index(0, 0));
constexpr auto b_e_n_ho_wo_thread_desc = make_naive_tensor_descriptor_packed(make_tuple(
Number<EPerBlock>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
auto b_threadwise_transfer =
ThreadwiseTensorSliceTransfer_v2<FloatAB,
FloatAB,
decltype(b_e_n_ho_wo_global_desc),
decltype(b_e_n_ho_wo_thread_desc),
Sequence<EPerBlock, 1, HoPerThread, WoPerThread>,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
1,
true>(
b_e_n_ho_wo_global_desc,
make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global));
auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_shared_block, a_e_k_desc.GetElementSpaceSize());
// register allocation for output
StaticBuffer<AddressSpaceEnum_t::Vgpr,
FloatAcc,
c_k_n_ho_wo_thread_desc.GetElementSpaceSize(),
true>
c_thread_buf;
// initialize output thread tensor
ThreadwiseTensorSliceSet_v1<FloatAcc,
decltype(c_k_n_ho_wo_thread_desc),
Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
.Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0);
// hack to control index calculation when iterating over A and B matrix for threadwise copy
constexpr auto a_e_k_global_step_hacks = AGlobalStepHacks{};
constexpr auto b_e_n_ho_wo_global_step_hacks = BGlobalStepHacks{};
// hack to control index calculation when moving the slice window for the A and B matrix
// threadwise copy
constexpr auto a_e_k_global_move_slice_window_step_hack = AGlobalMoveSliceWindowStepHacks{};
constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack =
BGlobalMoveSliceWindowStepHacks{};
// double register buffer for B
StaticBuffer<AddressSpaceEnum_t::Vgpr,
FloatAB,
b_e_n_ho_wo_thread_desc.GetElementSpaceSize(),
true>
b_thread_even_buf, b_thread_odd_buf;
// LDS double buffer: preload data
{
a_blockwise_copy.RunRead(a_e_k_global_desc, a_global_buf, a_e_k_global_step_hacks);
b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
b_global_buf,
b_e_n_ho_wo_thread_desc,
make_tuple(I0, I0, I0, I0),
b_thread_even_buf,
b_e_n_ho_wo_global_step_hacks);
a_blockwise_copy.RunWrite(a_e_k_desc, a_block_buf);
}
__syncthreads();
if constexpr(HasMainKBlockLoop)
{
index_t e_block_data_begin = 0;
// LDS double buffer: main body
// use Do-While loop instead of For loop to simplify control flow
do
{
// even iteration
b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
b_thread_slice_copy_step);
b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
b_global_buf,
b_e_n_ho_wo_thread_desc,
make_tuple(I0, I0, I0, I0),
b_thread_odd_buf,
b_e_n_ho_wo_global_step_hacks);
// LDS double buffer: GEMM on current data
// TODO: @Zhang Jing: blockwise gemm should be able to move slice window
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0));
b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
b_thread_slice_copy_step);
b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
b_global_buf,
b_e_n_ho_wo_thread_desc,
make_tuple(I0, I0, I0, I0),
b_thread_even_buf,
b_e_n_ho_wo_global_step_hacks);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0));
e_block_data_begin += 2 * EPerBlock;
} while(e_block_data_begin < E - 2 * EPerBlock);
}
// LDS double buffer: tail
if constexpr(HasDoubleTailKBlockLoop) // if there are 2 iterations left
{
b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
b_thread_slice_copy_step);
b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
b_global_buf,
b_e_n_ho_wo_thread_desc,
make_tuple(I0, I0, I0, I0),
b_thread_odd_buf,
b_e_n_ho_wo_global_step_hacks);
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0));
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
}
else // if there is 1 iteration left
{
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
}
// output: register to global memory
{
// hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
constexpr auto c_k_n_ho_wo_global_tensor_step_hacks = CGlobalStepHacks{};
const index_t k_thread_data_on_global =
k_block_data_on_global + k_thread_id * KPerThread;
ThreadwiseTensorSliceTransfer_v1r3<FloatAcc,
FloatC,
decltype(c_k_n_ho_wo_thread_desc),
decltype(c_k_n_ho_wo_global_desc),
Sequence<KPerThread, 1, HoPerThread, WoPerThread>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
CGlobalMemoryDataOperation,
1,
true>(
c_k_n_ho_wo_global_desc,
make_multi_index(
k_thread_data_on_global, 0, ho_thread_data_on_global, wo_thread_data_on_global))
.Run(c_k_n_ho_wo_thread_desc,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
c_k_n_ho_wo_global_desc,
c_global_buf,
c_k_n_ho_wo_global_tensor_step_hacks);
}
}
// pass tensor descriptors by reference
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const AGlobalDesc& a_e_k_global_desc,
const FloatAB* __restrict__ p_a_global,
const BGlobalDesc& b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc& c_k_n_ho_wo_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
Run(a_e_k_global_desc,
p_a_global,
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_c_global,
p_shared_block,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
// pass tensor descriptors by their pointers
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const AGlobalDesc* p_a_e_k_global_desc,
const FloatAB* __restrict__ p_a_global,
const BGlobalDesc* p_b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc* p_c_k_n_ho_wo_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
const auto a_e_k_global_desc = *p_a_e_k_global_desc;
const auto b_e_n_ho_wo_global_desc = *p_b_e_n_ho_wo_global_desc;
const auto c_k_n_ho_wo_global_desc = *p_c_k_n_ho_wo_global_desc;
Run(a_e_k_global_desc,
p_a_global,
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
// pass tensor descriptors by void*
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const void* p_a_e_k_global_desc,
const FloatAB* __restrict__ p_a_global,
const void* p_b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const void* p_c_k_n_ho_wo_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
const auto a_e_k_global_desc = *reinterpret_cast<const AGlobalDesc*>(p_a_e_k_global_desc);
const auto b_e_n_ho_wo_global_desc =
*reinterpret_cast<const BGlobalDesc*>(p_b_e_n_ho_wo_global_desc);
const auto c_k_n_ho_wo_global_desc =
*reinterpret_cast<const CGlobalDesc*>(p_c_k_n_ho_wo_global_desc);
Run(a_e_k_global_desc,
p_a_global,
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
};
} // namespace ck
#endif
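// Illustrative host-side sketch (not part of these sources) of how the Run() above splits
// a 1-D block id into the (K, Ho, Wo) tile coordinates; this mirrors the readfirstlane
// arithmetic in the kernel.
struct BlockWorkIdx
{
    int k, ho, wo;
};

inline BlockWorkIdx
decompose_block_id(int block_id, int Ho, int Wo, int HoPerBlock, int WoPerBlock)
{
    const int ho_block_work_num  = Ho / HoPerBlock;
    const int wo_block_work_num  = Wo / WoPerBlock;
    const int hwo_block_work_num = ho_block_work_num * wo_block_work_num;

    const int k_block_work_id   = block_id / hwo_block_work_num;
    const int hwo_block_work_id = block_id - k_block_work_id * hwo_block_work_num;
    const int ho_block_work_id  = hwo_block_work_id / wo_block_work_num;
    const int wo_block_work_id  = hwo_block_work_id - ho_block_work_id * wo_block_work_num;

    return BlockWorkIdx{k_block_work_id, ho_block_work_id, wo_block_work_id};
}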
#ifndef CK_GRIDWISE_GEMM_XDLOPS_V2R3_HPP
#define CK_GRIDWISE_GEMM_XDLOPS_V2R3_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "threadwise_tensor_slice_set.hpp"
namespace ck {
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AK0MK1GridDesc,
typename BK0NK1GridDesc,
typename CM0M1M2NGridDesc,
typename CBlockClusterAdaptor>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const AK0MK1GridDesc a_k0_m_k1_grid_desc,
const BK0NK1GridDesc b_k0_n_k1_grid_desc,
const CM0M1M2NGridDesc c_m0_m1_m2_n_grid_desc,
const CBlockClusterAdaptor c_block_cluster_adaptor)
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m0_m1_m2_n_grid_desc,
c_block_cluster_adaptor);
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AK0MK1GridDesc,
typename BK0NK1GridDesc,
typename CM0M1M2NGridDesc,
typename CBlockClusterAdaptor>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_k0_m_k1_grid_desc,
const void CONSTANT* p_b_k0_n_k1_grid_desc,
const void CONSTANT* p_c_m0_m1_m2_n_grid_desc,
const void CONSTANT* p_c_block_cluster_adaptor)
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
const auto a_k0_m_k1_grid_desc = *reinterpret_cast<const AK0MK1GridDesc*>(
cast_pointer_to_generic_address_space(p_a_k0_m_k1_grid_desc));
const auto b_k0_n_k1_grid_desc = *reinterpret_cast<const BK0NK1GridDesc*>(
cast_pointer_to_generic_address_space(p_b_k0_n_k1_grid_desc));
const auto c_m0_m1_m2_n_grid_desc = *reinterpret_cast<const CM0M1M2NGridDesc*>(
cast_pointer_to_generic_address_space(p_c_m0_m1_m2_n_grid_desc));
const auto c_block_cluster_adaptor = *reinterpret_cast<const CBlockClusterAdaptor*>(
cast_pointer_to_generic_address_space(p_c_block_cluster_adaptor));
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m0_m1_m2_n_grid_desc,
c_block_cluster_adaptor);
}
#endif
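// The two kernel wrappers above differ only in how the tensor descriptors reach the kernel:
// the BY_VALUE variant passes them by value as kernel arguments, while the BY_VOID_POINTER
// variant passes CONSTANT void pointers and casts them back to the concrete descriptor
// types inside the kernel.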
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
typename AK0MK1GridDesc,
typename BK0NK1GridDesc,
typename CMNGridDesc,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t MPerWave,
index_t NPerWave,
index_t K1Value,
index_t MRepeat,
index_t NRepeat,
typename ABlockTransferThreadSliceLengths_K0_M_K1,
typename ABlockTransferThreadClusterLengths_K0_M_K1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_K1,
bool AThreadTransferSrcResetCoordinateAfterRun,
typename BBlockTransferThreadSliceLengths_K0_N_K1,
typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_K1,
bool BThreadTransferSrcResetCoordinateAfterRun,
typename CThreadTransferSrcDstAccessOrder,
index_t CThreadTransferSrcDstVectorDim,
index_t CThreadTransferDstScalarPerVector,
typename AGridStepHacks,
typename BGridStepHacks,
typename CGridStepHacks,
typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowStepHacks,
bool CAccessOrderMRepeatNRepeat>
struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
// K1 should be Number<...>
static constexpr auto K1 = Number<K1Value>{};
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
constexpr auto max_lds_align = K1;
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_space_size =
math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_space_size =
math::integer_least_multiple(b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align);
return (a_block_space_size + b_block_space_size) * sizeof(FloatAB);
}
__host__ __device__ static constexpr bool
CheckValidity(const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
const CMNGridDesc& c_m_n_grid_desc)
{
// TODO: turn on this
static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
"wrong! K1 need to be known at compile-time");
const auto M = a_k0_m_k1_grid_desc.GetLength(I1);
const auto N = b_k0_n_k1_grid_desc.GetLength(I1);
const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0);
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
K0 == b_k0_n_k1_grid_desc.GetLength(I0) &&
K1 == a_k0_m_k1_grid_desc.GetLength(I2) &&
K1 == b_k0_n_k1_grid_desc.GetLength(I2)) &&
(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % KPerBlock == 0) &&
(MPerBlock % MPerWave == 0 && NPerBlock % NPerWave == 0);
}
__host__ __device__ static constexpr index_t
CalculateGridSize(const CMNGridDesc& c_m_n_grid_desc)
{
const auto M = c_m_n_grid_desc.GetLength(I0);
const auto N = c_m_n_grid_desc.GetLength(I1);
const index_t grid_size = (M / MPerBlock) * (N / NPerBlock);
return grid_size;
}
__host__ __device__ static constexpr auto
MakeCM0M1M2NGridDescriptor(const CMNGridDesc& c_m_n_grid_desc)
{
constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerWave, NPerWave, K1>{};
constexpr auto CLayout = xdlops_gemm.GetCLayout();
constexpr auto M0 = Number<CLayout.M1()>{};
constexpr auto M1 = Number<CLayout.N1()>{};
constexpr auto M2 = Number<CLayout.M0()>{};
constexpr index_t MWaves = MPerBlock / (MPerWave * MRepeat);
constexpr index_t NWaves = NPerBlock / (NPerWave * NRepeat);
constexpr auto N1 = Number<CLayout.N0()>{};
const auto c_m0_m1_m2_n_grid_desc = transform_tensor_descriptor(
c_m_n_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, M0, M1, M2)),
make_unmerge_transform(make_tuple(NRepeat, NWaves, N1))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2, 4, 5, 6>{}, Sequence<1, 3, 7>{}));
return c_m0_m1_m2_n_grid_desc;
}
__host__ __device__ static constexpr auto
MakeCBlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc)
{
const auto M = c_m_n_grid_desc.GetLength(I0);
const auto N = c_m_n_grid_desc.GetLength(I1);
constexpr auto M1 = Number<MPerBlock>{};
constexpr auto N1 = Number<NPerBlock>{};
const auto M0 = M / M1;
const auto N0 = N / N1;
#if 1
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(M0, N0))),
make_tuple(Sequence<0, 1>{}),
make_tuple(Sequence<0>{}));
#elif 1
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(N0, M0))),
make_tuple(Sequence<1, 0>{}),
make_tuple(Sequence<0>{}));
#endif
return c_blockid_to_m0_n0_block_cluster_adaptor;
}
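// Example (illustrative): with M0 = M / MPerBlock and N0 = N / NPerBlock, the merge-based
// adaptor above maps the 1-D block id back onto the 2-D block cluster index as
// (m0, n0) = (block_id / N0, block_id % N0), i.e. blocks sweep the N tiles fastest.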
using CM0M1M2NGridDesc = decltype(MakeCM0M1M2NGridDescriptor(CMNGridDesc{}));
using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}));
__device__ static void Run(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
FloatAB* __restrict__ p_shared_block,
const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
const CM0M1M2NGridDesc& c_m0_m1_m2_n_grid_desc,
const CBlockClusterAdaptor& c_block_cluster_adaptor)
{
const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_a_grid, a_k0_m_k1_grid_desc.GetElementSpaceSize());
const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_b_grid, b_k0_n_k1_grid_desc.GetElementSpaceSize());
auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_c_grid, c_m0_m1_m2_n_grid_desc.GetElementSpaceSize());
const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0);
// divide block work by [M, N]
const auto block_work_idx =
c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
// HACK: this force m/n_block_data_idx_on_grid into SGPR
const index_t m_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
const index_t n_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
// lds max alignment
constexpr auto max_lds_align = K1;
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
// A matrix blockwise copy
auto a_blockwise_copy =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<KPerBlock, MPerBlock, K1>,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
FloatAB,
FloatAB,
decltype(a_k0_m_k1_grid_desc),
decltype(a_k0_m_k1_block_desc),
ABlockTransferSrcAccessOrder,
Sequence<1, 0, 2>,
ABlockTransferSrcVectorDim,
2,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
1,
1,
AThreadTransferSrcResetCoordinateAfterRun,
true>(a_k0_m_k1_grid_desc,
make_multi_index(0, m_block_data_idx_on_grid, 0),
a_k0_m_k1_block_desc,
make_multi_index(0, 0, 0));
// B matrix blockwise copy
auto b_blockwise_copy =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<KPerBlock, NPerBlock, K1>,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
FloatAB,
FloatAB,
decltype(b_k0_n_k1_grid_desc),
decltype(b_k0_n_k1_block_desc),
BBlockTransferSrcAccessOrder,
Sequence<1, 0, 2>,
BBlockTransferSrcVectorDim,
2,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
1,
1,
BThreadTransferSrcResetCoordinateAfterRun,
true>(b_k0_n_k1_grid_desc,
make_multi_index(0, n_block_data_idx_on_grid, 0),
b_k0_n_k1_block_desc,
make_multi_index(0, 0, 0));
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx[KPerBlock, MPerBlock] is in LDS
// b_mtx[KPerBlock, NPerBlock] is in LDS
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
static_assert(MPerBlock % (MPerWave * MRepeat) == 0 &&
NPerBlock % (NPerWave * NRepeat) == 0,
"wrong!");
constexpr auto a_k0_m0_m1_k1_block_desc = transform_tensor_descriptor(
a_k0_m_k1_block_desc,
make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
make_unmerge_transform(
make_tuple(Number<MRepeat>{}, Number<MPerBlock / MRepeat>{})),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
constexpr auto b_k0_n0_n1_k1_block_desc = transform_tensor_descriptor(
b_k0_n_k1_block_desc,
make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
make_unmerge_transform(
make_tuple(Number<NRepeat>{}, Number<NPerBlock / NRepeat>{})),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
const auto blockwise_gemm =
BlockwiseGemmXdlops_km_kn_m0m1m2n_v1<BlockSize,
FloatAB,
decltype(a_k0_m0_m1_k1_block_desc),
decltype(b_k0_n0_n1_k1_block_desc),
MPerWave,
NPerWave,
K1>{};
constexpr auto CLayout = blockwise_gemm.GetCLayout();
constexpr index_t BlkSize = CLayout.GetBlkSize();
constexpr index_t NumBlks = CLayout.GetNumBlks();
constexpr index_t NumXdlops = CLayout.GetNumXdlops();
static_assert(NumBlks == 1 && NumXdlops == 1, "K Reduction Mfma only");
constexpr auto c_mr_nr_blk_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{}, Number<NRepeat>{}));
StaticBuffer<AddressSpaceEnum_t::Vgpr,
vector_type<FloatAcc, BlkSize>,
c_mr_nr_blk_desc.GetElementSpaceSize(),
true>
c_thread_buf;
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_space_size =
math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
FloatAB* p_a_block = p_shared_block;
FloatAB* p_b_block = p_shared_block + a_block_space_size;
constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
// hack to control index calculation when iterating over A and B matrix for threadwise copy
constexpr auto a_k0_m_k1_grid_step_hacks = AGridStepHacks{};
constexpr auto b_k0_n_k1_grid_step_hacks = BGridStepHacks{};
// hack to control index calculation when moving the slice window for the A and B matrix
// threadwise copy
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hack = AGridMoveSliceWindowStepHacks{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hack = BGridMoveSliceWindowStepHacks{};
auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize());
auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());
// preload data into LDS
{
a_blockwise_copy.RunRead(a_k0_m_k1_grid_desc, a_grid_buf, a_k0_m_k1_grid_step_hacks);
b_blockwise_copy.RunRead(b_k0_n_k1_grid_desc, b_grid_buf, b_k0_n_k1_grid_step_hacks);
a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_buf);
}
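// The main loop below overlaps the global-memory reads for the next K block with the GEMM
// on the LDS tile already resident; the two block_sync_lds() calls separate reading the LDS
// tile (blockwise_gemm.Run) from overwriting it with the next tile (RunWrite).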
// main body
index_t k_block_data_begin = 0;
do
{
a_blockwise_copy.MoveSrcSliceWindow(a_k0_m_k1_grid_desc,
a_block_slice_copy_step,
a_k0_m_k1_grid_move_slice_window_step_hack);
b_blockwise_copy.MoveSrcSliceWindow(b_k0_n_k1_grid_desc,
b_block_slice_copy_step,
b_k0_n_k1_grid_move_slice_window_step_hack);
a_blockwise_copy.RunRead(a_k0_m_k1_grid_desc, a_grid_buf, a_k0_m_k1_grid_step_hacks);
block_sync_lds();
b_blockwise_copy.RunRead(b_k0_n_k1_grid_desc, b_grid_buf, b_k0_n_k1_grid_step_hacks);
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
block_sync_lds();
a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_buf);
k_block_data_begin += KPerBlock;
} while(k_block_data_begin < (K0 - KPerBlock));
// tail
{
block_sync_lds();
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
}
#if 0
// output: register to global memory
{
constexpr index_t M0 = CLayout.M1();
constexpr index_t M1 = CLayout.N1();
constexpr index_t M2 = CLayout.M0();
constexpr index_t N0 = CLayout.N1();
constexpr index_t N1 = CLayout.N0();
constexpr auto c_m0_m1_m2_n_thread_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{},
Number<NRepeat>{},
Number<1>{},
Number<1>{},
Number<M0>{},
Number<1>{},
Number<M2>{},
Number<1>{}));
StaticBuffer<AddressSpaceEnum_t::Vgpr, FloatC, c_m0_m1_m2_n_thread_desc.GetElementSpaceSize(), true>
c_blk_buf_;
static_for<0, MRepeat, 1>{}([&](auto mr_i) {
static_for<0, NRepeat, 1>{}([&](auto nr_i) {
constexpr auto blk_off =
c_mr_nr_blk_desc.CalculateOffset(make_tuple(mr_i, nr_i));
static_for<0, BlkSize, 1>{}([&](auto j) {
c_blk_buf_(Number<blk_off * BlkSize + j>{}) =
c_thread_buf[Number<blk_off>{}]
.template AsType<FloatAcc>()[Number<j>{}];
});
});
});
// calculate origin of thread output tensor on global memory
// blockwise GEMM c matrix starting index
const auto c_thread_mtx_on_block =
blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
const index_t m_thread_data_on_grid =
m_block_data_idx_on_grid + c_thread_mtx_on_block[I0];
const index_t n_thread_data_on_grid =
n_block_data_idx_on_grid + c_thread_mtx_on_block[I1];
constexpr auto c_m0_m1_m2_n_grid_tensor_step_hacks = CGridStepHacks{};
constexpr index_t MWaves = MPerBlock / (MPerWave * MRepeat);
constexpr index_t NWaves = NPerBlock / (NPerWave * NRepeat);
ThreadwiseTensorSliceTransfer_v1r3<
FloatC,
FloatC,
decltype(c_m0_m1_m2_n_thread_desc),
decltype(c_m0_m1_m2_n_grid_desc),
Sequence<MRepeat, NRepeat, 1, 1, M0, 1, M2, 1>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
CGlobalMemoryDataOperation,
1,
true>{
c_m0_m1_m2_n_grid_desc,
make_multi_index(m_thread_data_on_grid / (M2 * M1 * M0 * MWaves),
n_thread_data_on_grid / (N1 * NWaves),
m_thread_data_on_grid % (M2 * M1 * M0 * MWaves) / (M2 * M1 * M0),
n_thread_data_on_grid % (N1 * NWaves) / N1,
m_thread_data_on_grid % (M2 * M1 * M0) / (M2 * M1),
m_thread_data_on_grid % (M2 * M1) / M2,
m_thread_data_on_grid % M2,
n_thread_data_on_grid % N1)}
.Run(c_m0_m1_m2_n_thread_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
c_blk_buf_,
c_m0_m1_m2_n_grid_desc,
c_grid_buf,
c_m0_m1_m2_n_grid_tensor_step_hacks);
}
#else
{
constexpr index_t M0 = CLayout.M1();
constexpr index_t M1 = CLayout.N1();
constexpr index_t M2 = CLayout.M0();
constexpr auto c_m0_m1_m2_n_thread_desc = make_naive_tensor_descriptor_packed(
make_tuple(I1, I1, I1, I1, Number<M0>{}, Number<1>{}, Number<M2>{}, Number<1>{}));
// calculate origin of thread output tensor on global memory
// blockwise GEMM c matrix starting index
const auto c_thread_mtx_on_block =
blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
const index_t m_thread_data_on_grid =
m_block_data_idx_on_grid + c_thread_mtx_on_block[I0];
const index_t n_thread_data_on_grid =
n_block_data_idx_on_grid + c_thread_mtx_on_block[I1];
constexpr auto c_m0_m1_m2_n_grid_tensor_step_hacks = CGridStepHacks{};
auto c_thread_copy =
ThreadwiseTensorSliceTransfer_v1r3<FloatC,
FloatC,
decltype(c_m0_m1_m2_n_thread_desc),
decltype(c_m0_m1_m2_n_grid_desc),
Sequence<1, 1, 1, 1, M0, 1, M2, 1>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
CGlobalMemoryDataOperation,
1,
true>{
c_m0_m1_m2_n_grid_desc,
make_multi_index(0,
0,
0,
0,
m_thread_data_on_grid / (M2 * M1),
m_thread_data_on_grid % (M2 * M1) / M2,
m_thread_data_on_grid % M2,
n_thread_data_on_grid)};
auto init_copy = [&](auto c_thread_idx_) {
constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_);
c_thread_copy.Run(c_m0_m1_m2_n_thread_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
c_thread_buf[Number<blk_off>{}].template AsType<FloatAcc>(),
c_m0_m1_m2_n_grid_desc,
c_grid_buf,
c_m0_m1_m2_n_grid_tensor_step_hacks);
return c_thread_idx_;
};
auto mrepeat_plus_copy = [&](auto c_thread_idx_) {
constexpr auto mrepeat_step_plus = make_multi_index(1, 0, 0, 0, 0, 0, 0, 0);
c_thread_copy.MoveDstSliceWindow(c_m0_m1_m2_n_grid_desc, mrepeat_step_plus);
constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_);
c_thread_copy.Run(c_m0_m1_m2_n_thread_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
c_thread_buf[Number<blk_off>{}].template AsType<FloatAcc>(),
c_m0_m1_m2_n_grid_desc,
c_grid_buf,
c_m0_m1_m2_n_grid_tensor_step_hacks);
};
auto nrepeat_plus_copy = [&](auto c_thread_idx_) {
constexpr auto nrepeat_step_plus = make_multi_index(0, 1, 0, 0, 0, 0, 0, 0);
c_thread_copy.MoveDstSliceWindow(c_m0_m1_m2_n_grid_desc, nrepeat_step_plus);
constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_);
c_thread_copy.Run(c_m0_m1_m2_n_thread_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
c_thread_buf[Number<blk_off>{}].template AsType<FloatAcc>(),
c_m0_m1_m2_n_grid_desc,
c_grid_buf,
c_m0_m1_m2_n_grid_tensor_step_hacks);
};
auto mrepeat_minus_copy = [&](auto c_thread_idx_) {
constexpr auto mrepeat_step_plus = make_multi_index(-1, 0, 0, 0, 0, 0, 0, 0);
c_thread_copy.MoveDstSliceWindow(c_m0_m1_m2_n_grid_desc, mrepeat_step_plus);
constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_);
c_thread_copy.Run(c_m0_m1_m2_n_thread_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
c_thread_buf[Number<blk_off>{}].template AsType<FloatAcc>(),
c_m0_m1_m2_n_grid_desc,
c_grid_buf,
c_m0_m1_m2_n_grid_tensor_step_hacks);
};
auto nrepeat_minus_copy = [&](auto c_thread_idx_) {
constexpr auto nrepeat_step_minus = make_multi_index(0, -1, 0, 0, 0, 0, 0, 0);
c_thread_copy.MoveDstSliceWindow(c_m0_m1_m2_n_grid_desc, nrepeat_step_minus);
constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_);
c_thread_copy.Run(c_m0_m1_m2_n_thread_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
c_thread_buf[Number<blk_off>{}].template AsType<FloatAcc>(),
c_m0_m1_m2_n_grid_desc,
c_grid_buf,
c_m0_m1_m2_n_grid_tensor_step_hacks);
};
static_assert((MRepeat == 4 && NRepeat == 4) or (MRepeat == 4 && NRepeat == 2) or
(MRepeat == 2 && NRepeat == 4) or (MRepeat == 2 && NRepeat == 2) or
(MRepeat == 2 && NRepeat == 1) or (MRepeat == 1 && NRepeat == 2) or
(MRepeat == 1 && NRepeat == 1),
"wrong");
if constexpr(MRepeat == 4 && NRepeat == 4)
{
init_copy(make_tuple(I0, I0));
if constexpr(CAccessOrderMRepeatNRepeat)
{
nrepeat_plus_copy(make_tuple(I0, I1));
nrepeat_plus_copy(make_tuple(I0, I2));
nrepeat_plus_copy(make_tuple(I0, I3));
mrepeat_plus_copy(make_tuple(I1, I3));
nrepeat_minus_copy(make_tuple(I1, I2));
nrepeat_minus_copy(make_tuple(I1, I1));
nrepeat_minus_copy(make_tuple(I1, I0));
mrepeat_plus_copy(make_tuple(I2, I0));
nrepeat_plus_copy(make_tuple(I2, I1));
nrepeat_plus_copy(make_tuple(I2, I2));
nrepeat_plus_copy(make_tuple(I2, I3));
mrepeat_plus_copy(make_tuple(I3, I3));
nrepeat_minus_copy(make_tuple(I3, I2));
nrepeat_minus_copy(make_tuple(I3, I1));
nrepeat_minus_copy(make_tuple(I3, I0));
}
else
{
mrepeat_plus_copy(make_tuple(I1, I0));
mrepeat_plus_copy(make_tuple(I2, I0));
mrepeat_plus_copy(make_tuple(I3, I0));
nrepeat_plus_copy(make_tuple(I3, I1));
mrepeat_minus_copy(make_tuple(I2, I1));
mrepeat_minus_copy(make_tuple(I1, I1));
mrepeat_minus_copy(make_tuple(I0, I1));
nrepeat_plus_copy(make_tuple(I0, I2));
mrepeat_plus_copy(make_tuple(I1, I2));
mrepeat_plus_copy(make_tuple(I2, I2));
mrepeat_plus_copy(make_tuple(I3, I2));
nrepeat_plus_copy(make_tuple(I3, I3));
mrepeat_minus_copy(make_tuple(I2, I3));
mrepeat_minus_copy(make_tuple(I1, I3));
mrepeat_minus_copy(make_tuple(I0, I3));
}
}
else if constexpr(MRepeat == 4 && NRepeat == 2)
{
init_copy(make_tuple(I0, I0));
if constexpr(CAccessOrderMRepeatNRepeat)
{
nrepeat_plus_copy(make_tuple(I0, I1));
mrepeat_plus_copy(make_tuple(I1, I1));
nrepeat_minus_copy(make_tuple(I1, I0));
mrepeat_plus_copy(make_tuple(I2, I0));
nrepeat_plus_copy(make_tuple(I2, I1));
mrepeat_plus_copy(make_tuple(I3, I1));
nrepeat_minus_copy(make_tuple(I3, I0));
}
else
{
mrepeat_plus_copy(make_tuple(I1, I0));
mrepeat_plus_copy(make_tuple(I2, I0));
mrepeat_plus_copy(make_tuple(I3, I0));
nrepeat_plus_copy(make_tuple(I3, I1));
mrepeat_minus_copy(make_tuple(I2, I1));
mrepeat_minus_copy(make_tuple(I1, I1));
mrepeat_minus_copy(make_tuple(I0, I1));
}
}
else if constexpr(MRepeat == 2 && NRepeat == 4)
{
init_copy(make_tuple(I0, I0));
if constexpr(CAccessOrderMRepeatNRepeat)
{
nrepeat_plus_copy(make_tuple(I0, I1));
nrepeat_plus_copy(make_tuple(I0, I2));
nrepeat_plus_copy(make_tuple(I0, I3));
mrepeat_plus_copy(make_tuple(I1, I3));
nrepeat_minus_copy(make_tuple(I1, I2));
nrepeat_minus_copy(make_tuple(I1, I1));
nrepeat_minus_copy(make_tuple(I1, I0));
}
else
{
mrepeat_plus_copy(make_tuple(I1, I0));
nrepeat_plus_copy(make_tuple(I1, I1));
mrepeat_minus_copy(make_tuple(I0, I1));
nrepeat_plus_copy(make_tuple(I0, I2));
mrepeat_plus_copy(make_tuple(I1, I2));
nrepeat_plus_copy(make_tuple(I1, I3));
mrepeat_minus_copy(make_tuple(I0, I3));
}
}
else if constexpr(MRepeat == 2 && NRepeat == 2)
{
init_copy(make_tuple(I0, I0));
if constexpr(CAccessOrderMRepeatNRepeat)
{
nrepeat_plus_copy(make_tuple(I0, I1));
mrepeat_plus_copy(make_tuple(I1, I1));
nrepeat_minus_copy(make_tuple(I1, I0));
}
else
{
mrepeat_plus_copy(make_tuple(I1, I0));
nrepeat_plus_copy(make_tuple(I1, I1));
mrepeat_minus_copy(make_tuple(I0, I1));
}
}
else if constexpr(MRepeat == 2 && NRepeat == 1)
{
init_copy(make_tuple(I0, I0));
mrepeat_plus_copy(make_tuple(I1, I0));
}
else if constexpr(MRepeat == 1 && NRepeat == 2)
{
init_copy(make_tuple(I0, I0));
nrepeat_plus_copy(make_tuple(I0, I1));
}
else if constexpr(MRepeat == 1 && NRepeat == 1)
{
init_copy(make_tuple(I0, I0));
}
}
#endif
}
};
} // namespace ck
#endif
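// The per-thread C write-out above walks the (MRepeat, NRepeat) tile grid with the
// init/mrepeat_plus/nrepeat_plus/..._minus copy lambdas so that every MoveDstSliceWindow is
// a +/-1 step in a single dimension. A standalone sketch (illustrative only, not part of
// these sources) of that zig-zag visiting order:
#include <cstdio>

inline void print_c_tile_visit_order(int MRepeat, int NRepeat, bool n_fastest)
{
    if(n_fastest) // corresponds to CAccessOrderMRepeatNRepeat == true
    {
        for(int m = 0; m < MRepeat; ++m)
            for(int j = 0; j < NRepeat; ++j)
                std::printf("(%d, %d)\n", m, (m % 2 == 0) ? j : NRepeat - 1 - j);
    }
    else // sweep MRepeat fastest, reversing direction on every N column
    {
        for(int n = 0; n < NRepeat; ++n)
            for(int j = 0; j < MRepeat; ++j)
                std::printf("(%d, %d)\n", (n % 2 == 0) ? j : MRepeat - 1 - j, n);
    }
}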
#ifndef CK_THREADWISE_CONTRACTION_DLOPS_HPP
#define CK_THREADWISE_CONTRACTION_DLOPS_HPP
#include "common_header.hpp"
#include "math.hpp"
namespace ck {
// C[TM0, TM1, TN0, TN1] += A[TK, TM0, TM1] * B[TK, TN0, TN1]
// Tensor element can be vectorized data
// Assume:
// 1. AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, CThreadDesc_TM0_TM1_TN0_TN1 are
// known at compile-time
// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
template <typename FloatA,
typename FloatB,
typename FloatC,
typename AThreadDesc_TK0_TM0_TM1_TK1,
typename BThreadDesc_TK0_TN0_TN1_TK1,
typename CThreadDesc_TM0_TM1_TN0_TN1,
typename TKLengths,
typename TMLengths,
typename TNLengths,
typename enable_if<AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
bool>::type = false>
struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1
{
__device__ constexpr ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1()
{
static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
// TODO: sanity-check: compare AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1,
// CThreadDesc_TM0_TM1_TN0_TN1 Size with TKLengths, TMLengths and TNLengths
// TODO remove this restriction
static_assert(TKLengths::Size() == 1 && TMLengths::Size() == 2 && TNLengths::Size() == 2,
"wrong!");
}
template <typename ABuffer,
typename AOriginIdx,
typename BBuffer,
typename BOriginIdx,
typename CBuffer,
typename COriginIdx>
__device__ static void Run(const ABuffer& a_buf,
AOriginIdx,
const BBuffer& b_buf,
BOriginIdx,
CBuffer& c_buf,
COriginIdx)
{
static_assert(
is_known_at_compile_time<remove_cv_t<remove_reference_t<AOriginIdx>>>::value &&
is_known_at_compile_time<remove_cv_t<remove_reference_t<BOriginIdx>>>::value &&
is_known_at_compile_time<remove_cv_t<remove_reference_t<COriginIdx>>>::value,
"wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time");
static_assert(is_same<remove_cv_t<remove_reference_t<typename ABuffer::type>>,
remove_cv_t<remove_reference_t<FloatA>>>::value &&
is_same<remove_cv_t<remove_reference_t<typename BBuffer::type>>,
remove_cv_t<remove_reference_t<FloatB>>>::value &&
is_same<remove_cv_t<remove_reference_t<typename CBuffer::type>>,
remove_cv_t<remove_reference_t<FloatC>>>::value,
"wrong! inconsistent type");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto TK = TKLengths{}[I0];
constexpr auto TM0 = TMLengths{}[I0];
constexpr auto TM1 = TMLengths{}[I1];
constexpr auto TN0 = TNLengths{}[I0];
constexpr auto TN1 = TNLengths{}[I1];
constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
constexpr auto c_origin_idx = to_multi_index(COriginIdx{});
static_for<0, TK, 1>{}([&](auto tk) {
static_for<0, TM0, 1>{}([&](auto tm0) {
static_for<0, TM1, 1>{}([&](auto tm1) {
static_for<0, TN0, 1>{}([&](auto tn0) {
static_for<0, TN1, 1>{}([&](auto tn1) {
constexpr index_t a_offset =
AThreadDesc_TK0_TM0_TM1_TK1{}.CalculateOffset(
a_origin_idx + make_multi_index(tk, tm0, tm1));
constexpr index_t b_offset =
BThreadDesc_TK0_TN0_TN1_TK1{}.CalculateOffset(
b_origin_idx + make_multi_index(tk, tn0, tn1));
constexpr index_t c_offset =
CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1));
inner_product<FloatA, FloatB, FloatC>(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset>{}],
c_buf(Number<c_offset>{}));
});
});
});
});
});
}
};
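// A plain-array reference (illustrative only, not part of these sources) for the
// contraction C[TM0, TM1, TN0, TN1] += A[TK, TM0, TM1] * B[TK, TN0, TN1] computed by the
// struct above, with descriptors and origin indices replaced by ordinary array indexing:
template <int TK, int TM0, int TM1, int TN0, int TN1>
void contraction_reference(const float (&a)[TK][TM0][TM1],
                           const float (&b)[TK][TN0][TN1],
                           float (&c)[TM0][TM1][TN0][TN1])
{
    for(int k = 0; k < TK; ++k)
        for(int m0 = 0; m0 < TM0; ++m0)
            for(int m1 = 0; m1 < TM1; ++m1)
                for(int n0 = 0; n0 < TN0; ++n0)
                    for(int n1 = 0; n1 < TN1; ++n1)
                        c[m0][m1][n0][n1] += a[k][m0][m1] * b[k][n0][n1];
}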
// C[TM0, TM1, TN0, TN1] += A[TK0, TM0, TM1, TK1] * B[TK0, TN0, TN1, TK1]
// Tensor element can be vectorized data
// Assume:
// 1. AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, CThreadDesc_TM0_TM1_TN0_TN1 are
// known at compile-time
// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
template <typename FloatA,
typename FloatB,
typename FloatC,
typename AThreadDesc_TK0_TM0_TM1_TK1,
typename BThreadDesc_TK0_TN0_TN1_TK1,
typename CThreadDesc_TM0_TM1_TN0_TN1,
typename TKLengths,
typename TMLengths,
typename TNLengths,
typename enable_if<AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
bool>::type = false>
struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
{
__device__ constexpr ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1()
{
static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
// TODO: sanity-check: compare AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1,
// CThreadDesc_TM0_TM1_TN0_TN1 Size with TKLengths, TMLengths and TNLengths
// TODO remove this restriction
static_assert(TKLengths::Size() == 2 && TMLengths::Size() == 2 && TNLengths::Size() == 2,
"wrong!");
}
template <typename ABuffer,
typename AOriginIdx,
typename BBuffer,
typename BOriginIdx,
typename CBuffer,
typename COriginIdx>
__device__ static void Run(const ABuffer& a_buf,
AOriginIdx,
const BBuffer& b_buf,
BOriginIdx,
CBuffer& c_buf,
COriginIdx)
{
static_assert(
is_known_at_compile_time<remove_cv_t<remove_reference_t<AOriginIdx>>>::value &&
is_known_at_compile_time<remove_cv_t<remove_reference_t<BOriginIdx>>>::value &&
is_known_at_compile_time<remove_cv_t<remove_reference_t<COriginIdx>>>::value,
"wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time");
static_assert(is_same<remove_cv_t<remove_reference_t<typename ABuffer::type>>,
remove_cv_t<remove_reference_t<FloatA>>>::value &&
is_same<remove_cv_t<remove_reference_t<typename BBuffer::type>>,
remove_cv_t<remove_reference_t<FloatB>>>::value &&
is_same<remove_cv_t<remove_reference_t<typename CBuffer::type>>,
remove_cv_t<remove_reference_t<FloatC>>>::value,
"wrong! inconsistent type");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr index_t TK0 = TKLengths{}[I0];
constexpr index_t TK1 = TKLengths{}[I1];
constexpr index_t TM0 = TMLengths{}[I0];
constexpr index_t TM1 = TMLengths{}[I1];
constexpr index_t TN0 = TNLengths{}[I0];
constexpr index_t TN1 = TNLengths{}[I1];
constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
constexpr auto c_origin_idx = to_multi_index(COriginIdx{});
static_for<0, TK0, 1>{}([&](auto tk0) {
static_for<0, TM0, 1>{}([&](auto tm0) {
static_for<0, TM1, 1>{}([&](auto tm1) {
static_for<0, TN0, 1>{}([&](auto tn0) {
static_for<0, TN1, 1>{}([&](auto tn1) {
vector_type<FloatA, TK1> a_vec;
vector_type<FloatB, TK1> b_vec;
static_for<0, TK1, 1>{}([&](auto tk1) {
constexpr index_t a_offset =
AThreadDesc_TK0_TM0_TM1_TK1{}.CalculateOffset(
a_origin_idx + make_multi_index(tk0, tm0, tm1, tk1));
constexpr index_t b_offset =
BThreadDesc_TK0_TN0_TN1_TK1{}.CalculateOffset(
b_origin_idx + make_multi_index(tk0, tn0, tn1, tk1));
a_vec.template AsType<FloatA>()(tk1) = a_buf[Number<a_offset>{}];
b_vec.template AsType<FloatB>()(tk1) = b_buf[Number<b_offset>{}];
});
using a_vector_t = typename vector_type<FloatA, TK1>::type;
using b_vector_t = typename vector_type<FloatB, TK1>::type;
constexpr index_t c_offset =
CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1));
inner_product<a_vector_t, b_vector_t, FloatC>(
a_vec.template AsType<a_vector_t>()[I0],
b_vec.template AsType<b_vector_t>()[I0],
c_buf(Number<c_offset>{}));
});
});
});
});
});
}
};
} // namespace ck
#endif
#ifndef CK_THREADWISE_GEMM_DLOPS_V3_HPP
#define CK_THREADWISE_GEMM_DLOPS_V3_HPP
#include "common_header.hpp"
#include "math.hpp"
namespace ck {
// C[M, N] += transpose(A[K, M]) * B[K, N]
// Element of matrix can be vectorized data
// Assume:
// 1. ADesc, BDesc, CDesc are known at compile-time
// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
template <typename FloatA,
typename FloatB,
typename FloatC,
typename ADesc,
typename BDesc,
typename CDesc,
index_t H,
index_t W,
typename enable_if<ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(),
bool>::type = false>
struct ThreadwiseGemmDlops_km_kn_mn_v3
{
template <typename ABuffer,
typename AOriginIdx,
typename BBuffer,
typename BOriginIdx,
typename CBuffer,
typename COriginIdx>
__device__ static void Run(const ABuffer& a_buf,
AOriginIdx,
const BBuffer& b_buf,
BOriginIdx,
CBuffer& c_buf,
COriginIdx)
{
static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
static_assert(
is_known_at_compile_time<remove_cv_t<remove_reference_t<AOriginIdx>>>::value &&
is_known_at_compile_time<remove_cv_t<remove_reference_t<BOriginIdx>>>::value &&
is_known_at_compile_time<remove_cv_t<remove_reference_t<COriginIdx>>>::value,
"wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time");
static_assert(is_same<remove_cv_t<remove_reference_t<typename ABuffer::type>>,
remove_cv_t<remove_reference_t<FloatA>>>::value &&
is_same<remove_cv_t<remove_reference_t<typename BBuffer::type>>,
remove_cv_t<remove_reference_t<FloatB>>>::value &&
is_same<remove_cv_t<remove_reference_t<typename CBuffer::type>>,
remove_cv_t<remove_reference_t<FloatC>>>::value,
"wrong! inconsistent type");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto E = ADesc{}.GetLength(I0);
constexpr auto K = ADesc{}.GetLength(I1);
constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
constexpr auto c_origin_idx = to_multi_index(COriginIdx{});
static_for<0, E, 1>{}([&](auto e) {
static_for<0, K, 1>{}([&](auto k) {
constexpr index_t a_offset =
ADesc{}.CalculateOffset(a_origin_idx + make_tuple(e, k));
if constexpr(H == 2 && W == 2)
{
constexpr index_t b_offset_0 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 1));
constexpr index_t b_offset_2 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_3 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 1));
constexpr index_t c_offset_0 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 1));
constexpr index_t c_offset_2 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_3 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 1));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}],
b_buf[Number<b_offset_1>{}],
b_buf[Number<b_offset_2>{}],
b_buf[Number<b_offset_3>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}),
c_buf(Number<c_offset_2>{}),
c_buf(Number<c_offset_3>{}));
}
else if constexpr(H == 4 && W == 1)
{
constexpr index_t b_offset_0 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_2 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 2, 0));
constexpr index_t b_offset_3 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 3, 0));
constexpr index_t c_offset_0 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_2 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 2, 0));
constexpr index_t c_offset_3 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 3, 0));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}],
b_buf[Number<b_offset_1>{}],
b_buf[Number<b_offset_2>{}],
b_buf[Number<b_offset_3>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}),
c_buf(Number<c_offset_2>{}),
c_buf(Number<c_offset_3>{}));
}
else
{
static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) {
constexpr index_t b_offset =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, h, w));
constexpr index_t c_offset =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w));
#if 0
c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(
a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
#else
amd_assembly_inner_product(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset>{}],
c_buf(Number<c_offset>{}));
#endif
});
});
}
});
});
}
};
} // namespace ck
#endif
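// Plain-C++ reference (illustrative only, not part of these sources) for the 1x4
// outer-product update issued above through amd_assembly_outer_product_1x4: one A element
// is multiplied against four B elements and accumulated into four C elements.
inline void outer_product_1x4_reference(float a,
                                        float b0,
                                        float b1,
                                        float b2,
                                        float b3,
                                        float& c0,
                                        float& c1,
                                        float& c2,
                                        float& c3)
{
    c0 += a * b0;
    c1 += a * b1;
    c2 += a * b2;
    c3 += a * b3;
}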
#ifndef CK_THREADWISE_TENSOR_SET_HPP
#define CK_THREADWISE_TENSOR_SET_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
namespace ck {
// Assume:
// 1. Desc is known at compile-time
// 2. Buffer is StaticBuffer
// 3. OriginIdx is known at compile-time
// 4. use #-step
template <typename Data,
typename Desc,
typename SliceLengths,
typename enable_if<Desc::IsKnownAtCompileTime(), bool>::type = false>
struct ThreadwiseTensorSliceSet_v1
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
template <typename OriginIdx, typename Buffer>
__device__ void Run(const Desc&, const OriginIdx&, Buffer& buf, const Data& initial_value) const
{
static_assert(Desc::IsKnownAtCompileTime(),
"wrong! Desc needs to be known at compile-time");
static_assert(Buffer::IsStaticBuffer(), "wrong! Buffer needs to be a StaticBuffer");
static_assert(is_known_at_compile_time<remove_cv_t<remove_reference_t<OriginIdx>>>::value,
"wrong! OriginIdx needs to be known at compile-time");
// Desc is known at compile-time
constexpr auto desc = remove_cv_t<remove_reference_t<Desc>>{};
// OriginIdx is known at compile-time
constexpr auto origin_idx = to_multi_index(OriginIdx{});
static_ford<SliceLengths>{}([&](auto access_idx) {
constexpr auto coord = make_tensor_coordinate(desc, origin_idx + access_idx);
constexpr bool is_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(desc, coord);
constexpr index_t offset = coord.GetOffset();
if constexpr(is_valid)
{
buf(Number<offset>{}) = initial_value;
}
});
}
};
} // namespace ck
#endif
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
namespace ck {
// Do the following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
// and sometimes useless instructions:
// 1. Don't save a reference to a tensor descriptor in the class; pass the tensor descriptor
// in as an argument instead
// 2. Don't construct a new tensor coordinate every time it is used; update and reuse the
// same tensor coordinate instead
// 3. Don't use a pointer to a VGPR buffer; use a vector instead
namespace detail {
// TODO: How to fix this? It uses a struct instead of a lambda because a lambda
// doesn't have a constructor
template <index_t VectorDim, index_t ScalarPerVector>
struct lambda_scalar_per_access
{
__host__ __device__ constexpr auto operator()(index_t i) const
{
return (i == VectorDim) ? ScalarPerVector : 1;
}
};
template <index_t VectorDim>
struct lambda_scalar_step_in_vector
{
__host__ __device__ constexpr auto operator()(index_t i) const
{
return (i == VectorDim) ? 1 : 0;
}
};
} // namespace detail
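// Example (illustrative): with nDim = 4, VectorDim = 2 and ScalarPerVector = 4,
// generate_sequence over lambda_scalar_per_access yields the per-dimension access
// granularity Sequence<1, 1, 4, 1>, and lambda_scalar_step_in_vector yields the
// in-vector step Sequence<0, 0, 1, 0>.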
// Assume:
// 1. src:
// 1. SrcDesc is known at compile-time
// 2. SrcBuffer is StaticBuffer
// 3. SrcSliceOriginIdx is known at compile-time
// 2. dst:
// 1. DstDesc is not known at compile-time
// 2. DstBuffer is DynamicBuffer
// 3. DstSliceOriginIdx is not known at compile-time
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename DimAccessOrder,
index_t DstVectorDim,
index_t DstScalarPerVector,
InMemoryDataOperationEnum_t DstInMemOp,
index_t DstScalarStrideInVector,
bool DstResetCoordinateAfterRun,
typename enable_if<SrcDesc::IsKnownAtCompileTime(), bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v1r3
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
__device__ constexpr ThreadwiseTensorSliceTransfer_v1r3(const DstDesc& dst_desc,
const Index& dst_slice_origin_idx)
: dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx))
{
static_assert(SrcDesc::IsKnownAtCompileTime(),
"wrong! SrcDesc need to known at compile-time");
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
template <typename SrcSliceOriginIdx,
typename SrcBuffer,
typename DstBuffer,
typename DstStepHacks>
__device__ void Run(const SrcDesc&,
const SrcSliceOriginIdx&,
const SrcBuffer& src_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf,
const DstStepHacks& dst_step_hacks)
{
static_assert(SrcDesc::IsKnownAtCompileTime(),
"wrong! SrcDesc need to known at compile-time");
static_assert(
is_known_at_compile_time<remove_cv_t<remove_reference_t<SrcSliceOriginIdx>>>::value,
"wrong! SrcSliceOrigin need to known at compile-time");
static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer");
// static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
// remove_cv_t<remove_reference_t<SrcData>>>::value,
//"wrong! SrcBuffer data type is wrong");
// SrcDesc and src_slice_origin_idx are known at compile-time
constexpr auto src_desc = remove_cv_t<remove_reference_t<SrcDesc>>{};
constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto dst_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
constexpr auto dst_scalar_step_in_vector =
generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
constexpr auto dim_access_order = DimAccessOrder{};
constexpr auto ordered_access_lengths =
container_reorder_given_new2old(access_lengths, dim_access_order);
// make forward steps
const auto dst_forward_steps = generate_tuple(
[&](auto i) {
Index forward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(
dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
},
Number<nDim>{});
// make backward steps
const auto dst_backward_steps = generate_tuple(
[&](auto i) {
Index backward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(
dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
},
Number<nDim>{});
// loop over tensor and copy
static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
// judge move forward or move backward
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
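// Note on the sweep direction (illustrative): tmp is the linearized (mixed-radix) index of
// the access position over dimensions 0..i-1, so dimension i is swept forward when that
// prefix index is even and backward when it is odd. E.g. for ordered_access_lengths = {2, 3},
// dim 1 is visited 0,1,2 while the dim-0 index is 0 and 2,1,0 while it is 1, so consecutive
// accesses differ in exactly one dimension by a single +/- step.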
// calculate dst data index
constexpr auto dst_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i]
? ordered_access_idx[i]
: ordered_access_lengths[i] - 1 - ordered_access_idx[i];
});
return container_reorder_given_old2new(ordered_idx, dim_access_order) *
dst_scalar_per_access;
}();
typename vector_type_maker<DstData, DstScalarPerVector>::type dst_vector;
using dst_vector_t =
typename vector_type_maker<DstData, DstScalarPerVector>::type::type;
// copy data from src_buf into dst_vector
static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
constexpr index_t src_offset = src_desc.CalculateOffset(
src_slice_origin_idx + dst_data_idx + i * dst_scalar_step_in_vector);
dst_vector.template AsType<DstData>()(i) =
type_convert<DstData>{}(src_buf[Number<src_offset>{}]);
});
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
// copy data from dst_vector into dst_buf
dst_buf.template Set<dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
constexpr auto move_on_dim = [&]() constexpr
{
StaticallyIndexedArray<bool, nDim> move_on_dim_;
static_for<0, nDim, 1>{}([&](auto i) {
move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
static_for<i + 1, nDim, 1>{}([&](auto j) {
move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
});
});
return move_on_dim_;
}
();
// move
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(move_on_dim[i])
{
if constexpr(forward_sweep[i])
{
move_tensor_coordinate(
dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);
}
else
{
move_tensor_coordinate(
dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
}
}
});
});
// move dst coordinate back to slice origin (or not)
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
}
}
template <typename SrcSliceOriginIdx, typename SrcBuffer, typename DstBuffer>
__device__ void Run(const SrcDesc&,
const SrcSliceOriginIdx&,
const SrcBuffer& src_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf)
{
constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform();
constexpr auto zeros = typename uniform_sequence_gen<ntransform_dst, 0>::type{};
constexpr auto dst_step_hacks =
make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
Run(SrcDesc{}, SrcSliceOriginIdx{}, src_buf, dst_desc, dst_buf, dst_step_hacks);
}
__device__ static constexpr auto GetDstCoordinateResetStep()
{
constexpr auto I0 = Number<0>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto dst_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
constexpr auto dim_access_order = DimAccessOrder{};
constexpr auto ordered_access_lengths =
container_reorder_given_new2old(access_lengths, dim_access_order);
// judge move forward or move backward during the last iteration
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate dst data index after the last iteration in Run(), if it has not been reset
// by Run()
constexpr auto dst_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
});
return container_reorder_given_old2new(ordered_idx, dim_access_order) *
dst_scalar_per_access;
}();
//
constexpr auto reset_dst_data_step = [&]() {
Index reset_dst_data_step_;
static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
return reset_dst_data_step_;
}();
return reset_dst_data_step;
}
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// if dst coord was not reset by Run(), then need to adjust the step here
const auto adjusted_step_idx =
DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetDstCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
}
private:
DstCoord dst_coord_;
}; // struct ThreadwiseTensorSliceTransfer_v1r3
// Assume:
// 1. src:
// 1. SrcDesc is not known at compile-time
// 2. SrcBuffer is DynamicBuffer
// 3. src_slice_origin_idx is not known at compile-time
// 2. dst:
// 1. DstDesc is known at compile-time
// 2. DstBuffer is StaticBuffer
// 3. dst_slice_origin_idx is known at compile-time
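// Usage sketch (illustrative only; src_desc, src_origin_idx, DstThreadDesc and the
// buffers are hypothetical names):
//
//   auto transfer = ThreadwiseTensorSliceTransfer_v2<float,        // SrcData
//                                                    float,        // DstData
//                                                    decltype(src_desc),
//                                                    DstThreadDesc,
//                                                    Sequence<4>,  // SliceLengths
//                                                    Sequence<0>,  // DimAccessOrder
//                                                    0,            // SrcVectorDim
//                                                    4,            // SrcScalarPerVector
//                                                    1,            // SrcScalarStrideInVector
//                                                    true>{src_desc, src_origin_idx};
//   transfer.Run(src_desc, src_global_buf, DstThreadDesc{}, Sequence<0>{}, dst_thread_buf);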
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename DimAccessOrder,
index_t SrcVectorDim,
index_t SrcScalarPerVector,
index_t SrcScalarStrideInVector,
bool SrcResetCoordinateAfterRun,
typename enable_if<DstDesc::IsKnownAtCompileTime(), bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v2
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
__device__ constexpr ThreadwiseTensorSliceTransfer_v2(const SrcDesc& src_desc,
const Index& src_slice_origin_idx)
: src_coord_(make_tensor_coordinate(src_desc, src_slice_origin_idx))
{
static_assert(DstDesc::IsKnownAtCompileTime(),
"wrong! SrcDesc need to known at compile-time");
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
}
template <typename SrcBuffer,
typename DstBuffer,
typename DstSliceOriginIdx,
typename SrcStepHacks>
__device__ void Run(const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const DstDesc&,
const DstSliceOriginIdx&,
DstBuffer& dst_buf,
const SrcStepHacks& src_step_hacks)
{
static_assert(DstDesc::IsKnownAtCompileTime(),
"wrong! DstDesc need to known at compile-time");
static_assert(
is_known_at_compile_time<remove_cv_t<remove_reference_t<DstSliceOriginIdx>>>::value,
"wrong! DstSliceOrigin need to known at compile-time");
static_assert(is_same<remove_cv_t<remove_reference_t<typename DstBuffer::type>>,
remove_cv_t<remove_reference_t<DstData>>>::value,
"wrong! DstBuffer and DstData data type are inconsistent");
// DstDesc and dst_slice_origin_idx are known at compile-time
constexpr auto dst_desc = remove_cv_t<remove_reference_t<DstDesc>>{};
constexpr auto dst_slice_origin_idx = DstSliceOriginIdx{};
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto src_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
constexpr auto src_scalar_step_in_vector =
generate_sequence(detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto dim_access_order = DimAccessOrder{};
constexpr auto ordered_access_lengths =
container_reorder_given_new2old(access_lengths, dim_access_order);
// make forward steps
const auto src_forward_steps = generate_tuple(
[&](auto i) {
Index forward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(
src_desc, forward_step_idx, src_step_hacks[I0][i]);
},
Number<nDim>{});
// make backward steps
const auto src_backward_steps = generate_tuple(
[&](auto i) {
Index backward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(
src_desc, backward_step_idx, src_step_hacks[I1][i]);
},
Number<nDim>{});
// loop over tensor and copy
static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
// judge move forward or move backward
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate src data index
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i]
? ordered_access_idx[i]
: ordered_access_lengths[i] - 1 - ordered_access_idx[i];
});
return container_reorder_given_old2new(ordered_idx, dim_access_order) *
src_scalar_per_access;
}();
typename vector_type_maker<SrcData, SrcScalarPerVector>::type src_vector;
using src_vector_t =
typename vector_type_maker<SrcData, SrcScalarPerVector>::type::type;
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
// copy data from src_buf into src_vector
src_vector.template AsType<src_vector_t>()(Number<0>{}) =
src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid);
// copy data from src_vector into dst_buf
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t dst_offset =
dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
i * src_scalar_step_in_vector);
dst_buf(Number<dst_offset>{}) = src_vector.template AsType<SrcData>()[i];
});
constexpr auto move_on_dim = [&]() constexpr
{
StaticallyIndexedArray<bool, nDim> move_on_dim_;
static_for<0, nDim, 1>{}([&](auto i) {
move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
static_for<i + 1, nDim, 1>{}([&](auto j) {
move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
});
});
return move_on_dim_;
}
();
// move
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(move_on_dim[i])
{
if constexpr(forward_sweep[i])
{
move_tensor_coordinate(
src_desc, src_coord_, src_forward_steps[dim_access_order[i]]);
}
else
{
move_tensor_coordinate(
src_desc, src_coord_, src_backward_steps[dim_access_order[i]]);
}
}
});
});
// move src coordinate back to slice origin (or not)
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_reset_step =
make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep());
move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
}
}
template <typename SrcBuffer, typename DstBuffer, typename DstSliceOriginIdx>
__device__ void Run(const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const DstDesc&,
const DstSliceOriginIdx&,
DstBuffer& dst_buf)
{
constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform();
constexpr auto zeros = typename uniform_sequence_gen<ntransform_src, 0>::type{};
constexpr auto src_step_hacks =
make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
Run(src_desc, src_buf, DstDesc{}, DstSliceOriginIdx{}, dst_buf, src_step_hacks);
}
__device__ static constexpr auto GetSrcCoordinateResetStep()
{
constexpr auto I0 = Number<0>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto src_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto dim_access_order = DimAccessOrder{};
constexpr auto ordered_access_lengths =
container_reorder_given_new2old(access_lengths, dim_access_order);
// judge move forward or move backward during the last iteration
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate src data index after the last iteration in Run(), if it has not been reset
// by Run()
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
});
return container_reorder_given_old2new(ordered_idx, dim_access_order) *
src_scalar_per_access;
}();
//
constexpr auto reset_src_data_step = [&]() {
Index reset_src_data_step_;
static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; });
return reset_src_data_step_;
}();
return reset_src_data_step;
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// if src coord was not reset by Run(), then need to adjust the step here
const auto adjusted_step_idx =
SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetSrcCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
private:
SrcCoord src_coord_;
}; // struct ThreadwiseTensorSliceTransfer_v2
// Assume:
// 1. src_desc and dst_desc are not known at compile-time
// 2. SrcBuffer and DstBuffer are DynamicBuffer
// 3. src_slice_origin and dst_slice_origin are not known at compile-time,
// 4. Use thread buffer
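// Usage sketch (illustrative only; descriptors, origins and buffers are hypothetical
// names). Data is staged through the private VGPR buffer_ declared at the end of the
// struct:
//
//   auto transfer = ThreadwiseTensorSliceTransfer_v3</* ... */>{src_desc, src_origin,
//                                                               dst_desc, dst_origin};
//   transfer.RunRead(src_desc, src_global_buf);       // global/LDS -> thread buffer
//   transfer.RunWrite(dst_desc, dst_lds_buf);         // thread buffer -> global/LDS
//   transfer.MoveSrcSliceWindow(src_desc, src_step);  // advance window for the next tile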
template <typename SliceLengths,
InMemoryDataOperationEnum_t DstInMemOp,
typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorDim,
index_t DstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector,
bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
// RunRead(), will be fused with MoveSrcSliceWindow to
// save addr computation
bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
// RunWrite(), will be fused with MoveDstSliceWindow to
// save addr computation
struct ThreadwiseTensorSliceTransfer_v3
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
__device__ constexpr ThreadwiseTensorSliceTransfer_v3(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin)
: src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin))
{
// TODO: fix this
static_assert(is_same<SrcData, DstData>::value,
"wrong! current implementation assume SrcData and DstData are same type");
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
template <typename SrcBuffer, typename SrcStepHacks>
__device__ void
RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
{
static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
"wrong!");
static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
remove_cv_t<remove_reference_t<SrcData>>>::value,
"wrong! SrcBuffer and SrcData data type are inconsistent");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto src_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
constexpr auto src_scalar_step_in_vector =
generate_sequence(detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto src_dim_access_order = SrcDimAccessOrder{};
constexpr auto ordered_src_access_lengths =
container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
// make forward steps
const auto src_forward_steps = generate_tuple(
[&](auto i) {
Index forward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(
src_desc, forward_step_idx, src_step_hacks[I0][i]);
},
Number<nDim>{});
// make backward steps
const auto src_backward_steps = generate_tuple(
[&](auto i) {
Index backward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(
src_desc, backward_step_idx, src_step_hacks[I1][i]);
},
Number<nDim>{});
// loop over tensor and copy
static_ford<decltype(ordered_src_access_lengths)>{}([&](auto ordered_src_access_idx) {
// judge move forward or move backward
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate src data index
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
: ordered_src_access_lengths[i] - 1 -
ordered_src_access_idx[i];
});
return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
src_scalar_per_access;
}();
vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;
using src_vector_t = typename decltype(src_tmp_vector)::type;
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
// copy data from src_buf to src_tmp_vector
src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid);
// copy data from src_tmp_vector to buffer_
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);
buffer_(Number<buffer_offset>{}) = src_tmp_vector.template AsType<SrcData>()[i];
});
constexpr auto move_on_dim = [&]() constexpr
{
StaticallyIndexedArray<bool, nDim> move_on_dim_;
static_for<0, nDim, 1>{}([&](auto i) {
move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
static_for<i + 1, nDim, 1>{}([&](auto j) {
move_on_dim_(i) &=
ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
});
});
return move_on_dim_;
}
();
// move
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(move_on_dim[i])
{
if constexpr(forward_sweep[i])
{
move_tensor_coordinate(
src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
}
else
{
move_tensor_coordinate(
src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
}
}
});
});
// move src coordinate back to slice origin (or not)
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_reset_step =
make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep());
move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
}
}
template <typename DstBuffer, typename DstStepHacks>
__device__ void
RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks)
{
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
"wrong!");
static_assert(is_same<remove_cv_t<remove_reference_t<typename DstBuffer::type>>,
remove_cv_t<remove_reference_t<DstData>>>::value,
"wrong! SrcBuffer or DstBuffer data type is wrong");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
// dst scalar per access on each dim
// TODO: don't use this
constexpr auto dst_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
constexpr auto dst_scalar_step_in_vector =
generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
constexpr auto dst_dim_access_order = DstDimAccessOrder{};
constexpr auto ordered_dst_access_lengths =
container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
// make forward steps
const auto dst_forward_steps = generate_tuple(
[&](auto i) {
Index forward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(
dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
},
Number<nDim>{});
// make backward steps
const auto dst_backward_steps = generate_tuple(
[&](auto i) {
Index backward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(
dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
},
Number<nDim>{});
// loop over tensor and copy
static_ford<decltype(ordered_dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
// judge move forward or move backward
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate dst data index
constexpr auto dst_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
: ordered_dst_access_lengths[i] - 1 -
ordered_dst_access_idx[i];
});
return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
dst_scalar_per_access;
}();
vector_type_maker_t<DstData, DstScalarPerVector> dst_tmp_vector;
// copy data from buffer_ to dst_tmp_vector
static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);
dst_tmp_vector.template AsType<DstData>()(i) = buffer_[Number<buffer_offset>{}];
});
using dst_vector_t = typename decltype(dst_tmp_vector)::type;
// copy data from dst_tmp_vector to dst_buf
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
dst_buf.template Set<dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
dst_tmp_vector.template AsType<dst_vector_t>()[Number<0>{}]);
constexpr auto move_on_dim = [&]() constexpr
{
StaticallyIndexedArray<bool, nDim> move_on_dim_;
static_for<0, nDim, 1>{}([&](auto i) {
move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
static_for<i + 1, nDim, 1>{}([&](auto j) {
move_on_dim_(i) &=
ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
});
});
return move_on_dim_;
}
();
// move
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(move_on_dim[i])
{
if constexpr(forward_sweep[i])
{
move_tensor_coordinate(
dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
}
else
{
move_tensor_coordinate(
dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
}
}
});
});
// move dst coordinate back to slice origin (or not)
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
}
}
template <typename SrcBuffer>
__device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
{
constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform();
constexpr auto zeros = typename uniform_sequence_gen<ntransform_src, 0>::type{};
constexpr auto src_step_hacks =
make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
RunRead(src_desc, src_buf, src_step_hacks);
}
template <typename DstBuffer>
__device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf)
{
constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform();
constexpr auto zeros = typename uniform_sequence_gen<ntransform_dst, 0>::type{};
constexpr auto dst_step_hacks =
make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
RunWrite(dst_desc, dst_buf, dst_step_hacks);
}
__device__ static constexpr auto GetSrcCoordinateResetStep()
{
constexpr auto I0 = Number<0>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto src_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto src_dim_access_order = SrcDimAccessOrder{};
constexpr auto ordered_src_access_lengths =
container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
// judge move forward or move backward during the last iteration
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate src data index after the last iteration in RunRead(), if it has not been
// reset by RunRead()
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
});
return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
src_scalar_per_access;
}();
//
constexpr auto reset_src_data_step = [&]() {
Index reset_src_data_step_;
static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; });
return reset_src_data_step_;
}();
return reset_src_data_step;
}
__device__ static constexpr auto GetDstCoordinateResetStep()
{
constexpr auto I0 = Number<0>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto dst_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
constexpr auto dst_dim_access_order = DstDimAccessOrder{};
constexpr auto ordered_dst_access_lengths =
container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
// judge move forward or move backward during the last iteration
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate dst data index after the last iteration in RunWrite(), if it has not been
// reset by RunWrite()
constexpr auto dst_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
});
return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
dst_scalar_per_access;
}();
//
constexpr auto reset_dst_data_step = [&]() {
Index reset_dst_data_step_;
static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
return reset_dst_data_step_;
}();
return reset_dst_data_step;
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx =
SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetSrcCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
template <typename SrcMoveSliceWindowStepHack>
__device__ void
MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx,
const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx =
SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetSrcCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(
src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// if dst coord was not reset by RunWrite(), then need to adjust the step here
const auto adjusted_step_idx =
DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetDstCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
}
private:
static constexpr auto buffer_desc_ =
make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{}));
static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
StaticBuffer<AddressSpaceEnum_t::Vgpr, SrcData, buffer_size_, true> buffer_;
SrcCoord src_coord_;
DstCoord dst_coord_;
};
// Assume:
// 1. src:
// 1. SrcDesc is known at compile-time
// 2. SrcBuffer is DynamicBuffer
// 3. src_ref_idx is known at run-time
// 4. SrcRefToOriginDisplacement is known at compile-time
// 5. use #-step
// 2. dst:
// 1. DstDesc is known at compile-time
// 2. DstBuffer is StaticBuffer
// 3. DstOriginIdx is known at compile-time
// 4. use direct address calculation
// 3. vector access on src
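// Usage sketch (illustrative only; descriptor types and index values are hypothetical):
// the reference coordinate is fixed once at run-time, and every Run() call reads a slice
// at a compile-time displacement from it into a compile-time-addressed static buffer:
//
//   auto transfer = ThreadwiseTensorSliceTransfer_v4<float, float,
//                                                    SrcLdsDesc, DstVgprDesc,
//                                                    Sequence<1, 4>,  // SliceLengths
//                                                    Sequence<0, 1>,  // DimAccessOrder
//                                                    1,               // SrcVectorDim
//                                                    4,               // SrcScalarPerVector
//                                                    1>{src_ref_idx}; // SrcScalarStrideInVector
//   transfer.Run(SrcLdsDesc{}, Sequence<0, 0>{}, src_lds_buf,
//                DstVgprDesc{}, Sequence<0, 0>{}, dst_vgpr_buf);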
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename DimAccessOrder,
index_t SrcVectorDim,
index_t SrcScalarPerVector,
index_t SrcScalarStrideInVector,
typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v4
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
__device__ constexpr ThreadwiseTensorSliceTransfer_v4(const Index& src_ref_idx)
: src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx))
{
static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
"wrong! SrcDesc and DstDesc need to known at compile-time");
static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0, "wrong!");
}
template <typename SrcRefToOriginDisplacement,
typename DstOriginIdx,
typename SrcBuffer,
typename DstBuffer>
__device__ void Run(const SrcDesc&,
const SrcRefToOriginDisplacement&,
const SrcBuffer& src_buf,
const DstDesc&,
const DstOriginIdx&,
DstBuffer& dst_buf) const
{
static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
"wrong! SrcDesc and DstDesc need to known at compile-time");
static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
remove_cv_t<remove_reference_t<SrcData>>>::value &&
is_same<remove_cv_t<remove_reference_t<typename DstBuffer::type>>,
remove_cv_t<remove_reference_t<DstData>>>::value,
"wrong! SrcBuffer or DstBuffer data type is wrong");
static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer");
static_assert(
is_known_at_compile_time<
remove_cv_t<remove_reference_t<SrcRefToOriginDisplacement>>>::value &&
is_known_at_compile_time<remove_cv_t<remove_reference_t<DstOriginIdx>>>::value,
"wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known "
"at compile-time");
// SrcDesc and DstDesc are known at compile-time
constexpr auto src_desc = remove_cv_t<remove_reference_t<SrcDesc>>{};
constexpr auto dst_desc = remove_cv_t<remove_reference_t<DstDesc>>{};
// SrcRefToOriginDisplacement and DstOriginIdx are known at compile-time
constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{});
constexpr auto dst_origin_idx = to_multi_index(DstOriginIdx{});
// scalar per access of each dim
constexpr auto src_scalar_per_access = generate_sequence_v2(
[&](auto i) constexpr {
if constexpr(i == SrcVectorDim)
{
return Number<SrcScalarPerVector>{};
}
else
{
return Number<1>{};
}
},
Number<nDim>{});
// scalar step (if stepping on SrcVectorDim) of each dim
constexpr auto src_scalar_step_in_vector = generate_sequence_v2(
[&](auto i) constexpr {
if constexpr(i == SrcVectorDim)
{
return Number<1>{};
}
else
{
return Number<0>{};
}
},
Number<nDim>{});
constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto dim_access_order = DimAccessOrder{};
constexpr auto ordered_access_lengths =
container_reorder_given_new2old(access_lengths, dim_access_order);
static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
#if 0
// TODO: unable to compile
// position in slice window
constexpr auto data_to_origin_disp_idx =
container_reorder_given_old2new(ordered_access_idx, dim_access_order) *
src_scalar_per_access;
#else
// position in slice window
constexpr auto data_to_origin_disp_idx =
ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
#endif
// src coordinate
constexpr auto src_ref_to_data_disp_idx =
src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
constexpr auto src_ref_to_data_disp_coord_step =
make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx);
auto src_data_coord = src_ref_coord_;
move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);
vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;
using src_vector_t = typename decltype(src_tmp_vector)::type;
const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_data_coord);
// copy data from src_buf into src_tmp_vector
src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
src_buf.template Get<src_vector_t>(src_data_coord.GetOffset(), is_src_valid);
// copy data from src_tmp_vector to dst_tmp_vector (cast data from SrcData to
// DstData)
vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;
// TODO: if SrcData and DstData are vector types, then static_cast may not compile
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
dst_tmp_vector.template AsType<DstData>()(i) =
type_convert<DstData>{}(src_tmp_vector.template AsType<SrcData>()[i]);
});
// copy data from dst_tmp_vector into dst_buf
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t dst_offset = dst_desc.CalculateOffset(
dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
});
});
}
template <typename SrcSliceMoveStepIdx>
__device__ void MoveSrcSliceWindow(const SrcDesc&,
const SrcSliceMoveStepIdx& src_slice_move_step_idx)
{
constexpr auto src_desc = SrcDesc{};
const auto src_slice_move_step_iter =
make_tensor_coordinate_step(src_desc, to_multi_index(src_slice_move_step_idx));
move_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter);
}
private:
SrcCoord src_ref_coord_;
};
} // namespace ck
#endif
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V2_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V2_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
namespace ck {
// Assume:
// 1. src_desc and dst_desc are not known at compile-time
// 2. SrcBuffer and DstBuffer are DynamicBuffer
// 3. src_slice_origin and dst_slice_origin are not known at compile-time,
// 4. Use thread buffer
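// Compared with ThreadwiseTensorSliceTransfer_v3 above, the per-access unit here is a
// small sub-tensor rather than a 1-D vector: SrcVectorTensorLengths /
// DstVectorTensorLengths give its shape, and the *ContiguousDimOrder sequences say which
// dimension is contiguous in memory. E.g. (illustrative values)
// SrcVectorTensorLengths = Sequence<1, 1, 1, 4> with contiguous dim order
// Sequence<0, 1, 2, 3> reproduces a plain 4-wide vector access along the last dimension.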
template <typename SliceLengths,
InMemoryDataOperationEnum_t DstInMemOp,
typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
typename SrcVectorTensorLengths,
typename DstVectorTensorLengths,
typename SrcVectorTensorContiguousDimOrder,
typename DstVectorTensorContiguousDimOrder,
bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
// RunRead(), will be fused with MoveSrcSliceWindow to
// save addr computation
bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
// RunWrite(), will be fused with MoveDstSliceWindow to
// save addr computation
struct ThreadwiseTensorSliceTransfer_v3r1
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
__device__ constexpr ThreadwiseTensorSliceTransfer_v3r1(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin)
: src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin))
{
// TODO: fix this
static_assert(is_same<SrcData, DstData>::value,
"wrong! current implementation assume SrcData and DstData are same type");
static_for<0, nDim, 1>{}([](auto i) {
static_assert(SliceLengths::At(i) % SrcVectorTensorLengths::At(i) == 0 &&
SliceLengths::At(i) % DstVectorTensorLengths::At(i) == 0,
"wrong!");
});
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
template <typename SrcBuffer, typename SrcStepHacks>
__device__ void
RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
{
static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
"wrong!");
static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
remove_cv_t<remove_reference_t<SrcData>>>::value,
"wrong! SrcBuffer and SrcData data type are inconsistent");
// tensor descriptor for src_vector
constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{};
constexpr auto src_vector_tensor_strides = container_reorder_given_old2new(
container_reverse_exclusive_scan(
container_reorder_given_new2old(src_vector_tensor_lengths,
SrcVectorTensorContiguousDimOrder{}),
math::multiplies{},
I1),
SrcVectorTensorContiguousDimOrder{});
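// Worked example (illustrative): for src_vector_tensor_lengths = {1, 1, 2, 4} and
// SrcVectorTensorContiguousDimOrder = {0, 1, 2, 3}, the reverse exclusive scan of the
// lengths with multiplication and init 1 yields packed strides {8, 8, 4, 1}: the
// contiguous (last) dimension gets stride 1 and every other dimension's stride is the
// product of the lengths to its right.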
constexpr auto src_vector_desc =
make_naive_tensor_descriptor(sequence_to_tuple_of_number(src_vector_tensor_lengths),
sequence_to_tuple_of_number(src_vector_tensor_strides));
// access order and lengths
constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths;
constexpr auto src_dim_access_order = SrcDimAccessOrder{};
constexpr auto ordered_src_access_lengths =
container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
// make forward steps
const auto src_forward_steps = generate_tuple(
[&](auto i) {
Index forward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step_idx(j) = (i.value == j.value) ? src_vector_tensor_lengths[i] : 0;
});
return make_tensor_coordinate_step(
src_desc, forward_step_idx, src_step_hacks[I0][i]);
},
Number<nDim>{});
// make backward steps
const auto src_backward_steps = generate_tuple(
[&](auto i) {
Index backward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step_idx(j) = (i.value == j.value) ? -src_vector_tensor_lengths[i] : 0;
});
return make_tensor_coordinate_step(
src_desc, backward_step_idx, src_step_hacks[I1][i]);
},
Number<nDim>{});
// loop over tensor and copy
static_ford<decltype(ordered_src_access_lengths)>{}([&](auto ordered_src_access_idx) {
// judge move forward or move backward
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate src data index
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
: ordered_src_access_lengths[i] - 1 -
ordered_src_access_idx[i];
});
return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
src_vector_tensor_lengths;
}();
vector_type_maker_t<SrcData, src_vector_desc.GetElementSpaceSize()> src_vector;
using src_vector_t = typename decltype(src_vector)::type;
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
// copy data from src_buf to src_vector
src_vector.template AsType<src_vector_t>()(I0) =
src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid);
// copy data from src_vector to buffer_
static_ford<SrcVectorTensorLengths>{}([&](auto src_vector_idx_) {
constexpr auto src_vector_idx = to_multi_index(src_vector_idx_);
constexpr index_t src_vector_offset =
src_vector_desc.CalculateOffset(src_vector_idx);
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(src_data_idx + src_vector_idx);
buffer_(Number<buffer_offset>{}) =
src_vector.template AsType<SrcData>()[Number<src_vector_offset>{}];
});
constexpr auto move_on_dim = [&]() constexpr
{
StaticallyIndexedArray<bool, nDim> move_on_dim_;
static_for<0, nDim, 1>{}([&](auto i) {
move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
static_for<i + 1, nDim, 1>{}([&](auto j) {
move_on_dim_(i) &=
ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
});
});
return move_on_dim_;
}
();
// move
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(move_on_dim[i])
{
if constexpr(forward_sweep[i])
{
move_tensor_coordinate(
src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
}
else
{
move_tensor_coordinate(
src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
}
}
});
});
// move src coordinate back to slice origin (or not)
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_reset_step =
make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep());
move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
}
}
template <typename DstBuffer, typename DstStepHacks>
__device__ void
RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks)
{
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
"wrong!");
static_assert(is_same<remove_cv_t<remove_reference_t<typename DstBuffer::type>>,
remove_cv_t<remove_reference_t<DstData>>>::value,
"wrong! SrcBuffer or DstBuffer data type is wrong");
// tensor descriptor for dst_vector
constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{};
constexpr auto dst_vector_tensor_strides = container_reorder_given_old2new(
container_reverse_exclusive_scan(
container_reorder_given_new2old(dst_vector_tensor_lengths,
DstVectorTensorContiguousDimOrder{}),
math::multiplies{},
I1),
DstVectorTensorContiguousDimOrder{});
constexpr auto dst_vector_desc =
make_naive_tensor_descriptor(sequence_to_tuple_of_number(dst_vector_tensor_lengths),
sequence_to_tuple_of_number(dst_vector_tensor_strides));
// dst access order and lengths
constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths;
constexpr auto dst_dim_access_order = DstDimAccessOrder{};
constexpr auto ordered_dst_access_lengths =
container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
// make forward steps
const auto dst_forward_steps = generate_tuple(
[&](auto i) {
Index forward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step_idx(j) = (i.value == j.value) ? dst_vector_tensor_lengths[i] : 0;
});
return make_tensor_coordinate_step(
dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
},
Number<nDim>{});
// make backward steps
const auto dst_backward_steps = generate_tuple(
[&](auto i) {
Index backward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step_idx(j) = (i.value == j.value) ? -dst_vector_tensor_lengths[i] : 0;
});
return make_tensor_coordinate_step(
dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
},
Number<nDim>{});
// loop over tensor and copy
static_ford<decltype(ordered_dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
// judge move forward or move backward
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate dst data index
constexpr auto dst_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
: ordered_dst_access_lengths[i] - 1 -
ordered_dst_access_idx[i];
});
return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
dst_vector_tensor_lengths;
}();
vector_type_maker_t<DstData, dst_vector_desc.GetElementSpaceSize()> dst_vector;
// copy data from buffer_ to dst_vector (also cast from SrcData to DstData)
static_ford<DstVectorTensorLengths>{}([&](auto dst_vector_idx_) {
constexpr auto dst_vector_idx = to_multi_index(dst_vector_idx_);
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(dst_data_idx + dst_vector_idx);
constexpr index_t dst_vector_offset =
dst_vector_desc.CalculateOffset(dst_vector_idx);
dst_vector.template AsType<DstData>()(Number<dst_vector_offset>{}) =
type_convert<DstData>{}(buffer_[Number<buffer_offset>{}]);
});
using dst_vector_t = typename decltype(dst_vector)::type;
// copy data from dst_vector to dst_buf
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
dst_buf.template Set<dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
constexpr auto move_on_dim = [&]() constexpr
{
StaticallyIndexedArray<bool, nDim> move_on_dim_;
static_for<0, nDim, 1>{}([&](auto i) {
move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
static_for<i + 1, nDim, 1>{}([&](auto j) {
move_on_dim_(i) &=
ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
});
});
return move_on_dim_;
}
();
// move
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(move_on_dim[i])
{
if constexpr(forward_sweep[i])
{
move_tensor_coordinate(
dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
}
else
{
move_tensor_coordinate(
dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
}
}
});
});
// move dst coordinate back to slice origin (or not)
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
}
}
template <typename SrcBuffer>
__device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
{
constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform();
constexpr auto zeros = typename uniform_sequence_gen<ntransform_src, 0>::type{};
constexpr auto src_step_hacks =
make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
RunRead(src_desc, src_buf, src_step_hacks);
}
template <typename DstBuffer>
__device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf)
{
constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform();
constexpr auto zeros = typename uniform_sequence_gen<ntransform_dst, 0>::type{};
constexpr auto dst_step_hacks =
make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
RunWrite(dst_desc, dst_buf, dst_step_hacks);
}
__device__ static constexpr auto GetSrcCoordinateResetStep()
{
constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{};
constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths;
constexpr auto src_dim_access_order = SrcDimAccessOrder{};
constexpr auto ordered_src_access_lengths =
container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
// judge move forward or move backward during the last iteration
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate src data index after the last iteration in RunRead(), if it has not been
// reset by RunRead()
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
});
return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
src_vector_tensor_lengths;
}();
//
constexpr auto reset_src_data_step = [&]() {
Index reset_src_data_step_;
static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; });
return reset_src_data_step_;
}();
return reset_src_data_step;
}
__device__ static constexpr auto GetDstCoordinateResetStep()
{
constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{};
constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths;
constexpr auto dst_dim_access_order = DstDimAccessOrder{};
constexpr auto ordered_dst_access_lengths =
container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
// judge move forward or move backward during the last iteration
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate dst data index after the last iteration in RunWrite(), if it has not been
// reset by RunWrite()
constexpr auto dst_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
});
return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
dst_vector_tensor_lengths;
}();
//
constexpr auto reset_dst_data_step = [&]() {
Index reset_dst_data_step_;
static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
return reset_dst_data_step_;
}();
return reset_dst_data_step;
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx =
SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetSrcCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
template <typename SrcMoveSliceWindowStepHack>
__device__ void
MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx,
const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx =
SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetSrcCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(
src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
// dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// if the dst coordinate was not reset by RunWrite(), the step needs to be adjusted here
const auto adjusted_step_idx =
DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetDstCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
}
private:
static constexpr auto buffer_desc_ =
make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{}));
static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
StaticBuffer<AddressSpaceEnum_t::Vgpr, SrcData, buffer_size_, true> buffer_;
SrcCoord src_coord_;
DstCoord dst_coord_;
};
// Assume:
// 1. src:
// 1. SrcDesc is known at compile-time
// 2. SrcBuffer is DynamicBuffer
// 3. src_ref_idx is known at run-time
// 4. SrcRefToOriginDisplacement is known at compile-time
// 5. use #-step
// 2. dst:
// 1. DstDesc is known at compile-time
// 2. DstBuffer is StaticBuffer
// 3. DstOriginIdx is known at compile-time
// 4. use direct address calculation
// 3. vector access on src
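// A hypothetical instantiation of the transfer below (illustration only; all concrete
// types and values are assumptions, not taken from a real solver):
//   using Transfer = ThreadwiseTensorSliceTransfer_v4r1<float,            // SrcData
//                                                       float,            // DstData
//                                                       decltype(src_desc),
//                                                       decltype(dst_desc),
//                                                       Sequence<1, 4>,   // SliceLengths
//                                                       Sequence<0, 1>,   // DimAccessOrder
//                                                       Sequence<1, 4>,   // SrcVectorTensorLengths
//                                                       Sequence<0, 1>>;  // SrcVectorTensorContiguousDimOrder
//   auto transfer = Transfer{make_multi_index(row, col)}; // run-time reference index
//   transfer.Run(src_desc, Sequence<0, 0>{}, src_buf, dst_desc, Sequence<0, 0>{}, dst_buf);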
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename DimAccessOrder,
typename SrcVectorTensorLengths,
typename SrcVectorTensorContiguousDimOrder,
typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v4r1
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
__device__ constexpr ThreadwiseTensorSliceTransfer_v4r1(const Index& src_ref_idx)
: src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx))
{
static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
"wrong! SrcDesc and DstDesc need to known at compile-time");
static_for<0, nDim, 1>{}([](auto i) {
static_assert(SliceLengths::At(i) % SrcVectorTensorLengths::At(i) == 0, "wrong!");
});
}
template <typename SrcRefToOriginDisplacement,
typename DstOriginIdx,
typename SrcBuffer,
typename DstBuffer>
__device__ void Run(const SrcDesc&,
const SrcRefToOriginDisplacement&,
const SrcBuffer& src_buf,
const DstDesc&,
const DstOriginIdx&,
DstBuffer& dst_buf) const
{
static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
"wrong! SrcDesc and DstDesc need to known at compile-time");
static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
remove_cv_t<remove_reference_t<SrcData>>>::value &&
is_same<remove_cv_t<remove_reference_t<typename DstBuffer::type>>,
remove_cv_t<remove_reference_t<DstData>>>::value,
"wrong! SrcBuffer or DstBuffer data type is wrong");
static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer");
static_assert(
is_known_at_compile_time<
remove_cv_t<remove_reference_t<SrcRefToOriginDisplacement>>>::value &&
is_known_at_compile_time<remove_cv_t<remove_reference_t<DstOriginIdx>>>::value,
"wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known "
"at compile-time");
// SrcDesc and DstDesc are known at compile-time
constexpr auto src_desc = remove_cv_t<remove_reference_t<SrcDesc>>{};
constexpr auto dst_desc = remove_cv_t<remove_reference_t<DstDesc>>{};
// SrcRefToOriginDisplacement and DstOriginIdx are known at compile-time
constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{});
constexpr auto dst_origin_idx = to_multi_index(DstOriginIdx{});
// tensor descriptor for src_vector
constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{};
constexpr auto src_vector_tensor_strides = container_reorder_given_old2new(
container_reverse_exclusive_scan(
container_reorder_given_new2old(src_vector_tensor_lengths,
SrcVectorTensorContiguousDimOrder{}),
math::multiplies{},
I1),
SrcVectorTensorContiguousDimOrder{});
constexpr auto src_vector_desc =
make_naive_tensor_descriptor(sequence_to_tuple_of_number(src_vector_tensor_lengths),
sequence_to_tuple_of_number(src_vector_tensor_strides));
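// Illustration only (values are assumptions): for src_vector_tensor_lengths = {1, 4} with
// contiguous dim order {0, 1}, the packed strides above are {4, 1}; with SliceLengths = {1, 8}
// the access_lengths below become {1, 2}, i.e. two vector loads of 4 elements along the
// contiguous dimension.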
// access order and lengths
constexpr auto access_lengths = SliceLengths{} / src_vector_tensor_lengths;
constexpr auto dim_access_order = DimAccessOrder{};
constexpr auto ordered_access_lengths =
container_reorder_given_new2old(access_lengths, dim_access_order);
static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
// position in slice window
constexpr auto data_to_origin_disp_idx =
ordered_access_idx.ReorderGivenOld2New(dim_access_order) *
src_vector_tensor_lengths;
// src coordinate at starting point of src_vector
constexpr auto src_ref_to_data_disp_idx =
src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
constexpr auto src_ref_to_data_disp_coord_step =
make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx);
auto src_data_coord = src_ref_coord_;
move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);
vector_type_maker_t<SrcData, src_vector_desc.GetElementSpaceSize()> src_vector;
using src_vector_t = typename decltype(src_vector)::type;
const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_data_coord);
// copy data from src_buf into src_vector
src_vector.template AsType<src_vector_t>()(I0) =
src_buf.template Get<src_vector_t>(src_data_coord.GetOffset(), is_src_valid);
// copy data from src_vector into dst_buf (also cast from SrcData to DstData)
static_ford<SrcVectorTensorLengths>{}([&](auto src_vector_idx_) {
constexpr auto src_vector_idx = to_multi_index(src_vector_idx_);
constexpr index_t src_vector_offset =
src_vector_desc.CalculateOffset(src_vector_idx);
constexpr index_t dst_offset = dst_desc.CalculateOffset(
dst_origin_idx + data_to_origin_disp_idx + src_vector_idx);
dst_buf(Number<dst_offset>{}) = type_convert<DstData>{}(
src_vector.template AsType<DstData>()[Number<src_vector_offset>{}]);
});
});
}
template <typename SrcSliceMoveStepIdx>
__device__ void MoveSrcSliceWindow(const SrcDesc&,
const SrcSliceMoveStepIdx& src_slice_move_step_idx)
{
constexpr auto src_desc = SrcDesc{};
const auto src_slice_move_step_iter =
make_tensor_coordinate_step(src_desc, to_multi_index(src_slice_move_step_idx));
move_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter);
}
private:
SrcCoord src_ref_coord_;
};
} // namespace ck
#endif