gaoqiong / composable_kernel · Commits · 07a673c6

Commit 07a673c6, authored Apr 14, 2022 by carlushuang

    Merge remote-tracking branch 'origin/develop' into cpu_avx2

Parents: c0f698d5, ac0d8066

Changes: 307
Showing 20 changed files with 226 additions and 1083 deletions (+226, -1083)
Files shown (20):

  include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp                 +8    -8
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp               +10   -10
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp                 +10   -10
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp                 +11   -11
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp                 +12   -12
  include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp                 +3    -3
  include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp          +122  -0     (new file)
  include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp        +7    -7
  include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r4.hpp   +0    -523   (deleted)
  include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r5.hpp   +0    -453   (deleted)
  include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp   +8    -8
  include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp   +7    -7
  include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp   +6    -6
  include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp   +1    -1
  include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp   +1    -1
  include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp   +1    -1
  include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp                               +2    -2
  include/ck/utility/amd_address_space.hpp                                           +4    -4
  include/ck/utility/amd_buffer_addressing.hpp                                       +4    -4
  include/ck/utility/common_header.hpp                                               +9    -12
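Nearly every +/- pair in the diffs below is the same mechanical change picked up from origin/develop: the enum type names InMemoryDataOperationEnum_t and AddressSpaceEnum_t lose their _t suffix and become InMemoryDataOperationEnum and AddressSpaceEnum. As a minimal illustration of what this rename amounts to at a call site, here is a self-contained sketch; the enum declarations are trimmed-down stand-ins listing only the enumerators that actually appear in this diff (the real definitions live in ck's utility headers and are richer), so treat them as assumptions rather than the library's exact definitions.

    // Illustrative stand-ins only: trimmed-down declarations of the two enums whose
    // names change in this merge; only enumerators that appear in this diff are listed.
    #include <cstdio>

    enum class InMemoryDataOperationEnum { Set, AtomicAdd, Add }; // assumed subset
    enum class AddressSpaceEnum { Global, Lds, Vgpr };            // assumed subset

    // Mirrors how the gridwise GEMM kernels are parameterized on the data operation,
    // e.g. "InMemoryDataOperationEnum CGlobalMemoryDataOperation" after the rename.
    template <InMemoryDataOperationEnum Op>
    const char* op_name()
    {
        if constexpr(Op == InMemoryDataOperationEnum::Set) { return "Set"; }
        else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd) { return "AtomicAdd"; }
        else { return "Add"; }
    }

    int main()
    {
        // Call sites change only in spelling: the old code wrote
        // InMemoryDataOperationEnum_t::Set, the merged code writes InMemoryDataOperationEnum::Set.
        std::printf("%s\n", op_name<InMemoryDataOperationEnum::Set>());
        return 0;
    }

Most files below accordingly show equal addition and deletion counts; the exceptions are the newly added reduction header, the two deleted threadwise transfer headers, and common_header.hpp.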
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp

@@ -59,7 +59,7 @@ template <index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename ABK0MK1GridDesc,
           typename BBK0NK1GridDesc,
           typename CMNGridDesc,
@@ -316,11 +316,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
         const CElementwiseOperation& c_element_op,
         const CBlockClusterAdaptor& c_block_cluster_adaptor)
     {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_b_grid, b_b_k0_n_k1_grid_desc.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_grid, c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc.GetElementSpaceSize());

         const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1);
@@ -410,7 +410,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
             BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                               AElementwiseOperation,
                                               ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               Sequence<1, K0PerBlock, MPerBlock, K1>,
                                               ABlockTransferThreadClusterLengths_K0_M_K1,
                                               ABlockTransferThreadClusterArrangeOrder,
@@ -440,7 +440,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
             BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                               BElementwiseOperation,
                                               ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               Sequence<1, K0PerBlock, NPerBlock, K1>,
                                               BBlockTransferThreadClusterLengths_K0_N_K1,
                                               BBlockTransferThreadClusterArrangeOrder,
@@ -497,9 +497,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
         constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);
         constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);

-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize());
-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());

         // preload data into LDS
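The hunks above all touch calls to make_dynamic_buffer, whose first template argument names the address space the raw pointer lives in (Global for the A/B/C grid pointers, Lds for the block tiles). As a rough mental model only, and not ck's actual DynamicBuffer (which also handles vectorized and guarded accesses), an address-space-tagged buffer can be thought of as a pointer plus a compile-time tag; the names in this sketch are hypothetical.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    enum class AddressSpaceEnum { Global, Lds, Vgpr }; // assumed subset

    // Simplified stand-in for an address-space-tagged dynamic buffer.
    template <AddressSpaceEnum AddrSpace, typename T>
    struct SimpleDynamicBuffer
    {
        T* p_data_;
        std::size_t element_space_size_;

        static constexpr AddressSpaceEnum GetAddressSpace() { return AddrSpace; }

        T& operator()(std::size_t i) { return p_data_[i]; }
    };

    // Mirrors the shape of the make_dynamic_buffer<AddressSpaceEnum::Global>(ptr, size) calls above.
    template <AddressSpaceEnum AddrSpace, typename T>
    SimpleDynamicBuffer<AddrSpace, T> make_simple_dynamic_buffer(T* p, std::size_t n)
    {
        return SimpleDynamicBuffer<AddrSpace, T>{p, n};
    }

    int main()
    {
        std::vector<float> grid(16, 0.0f);
        auto buf = make_simple_dynamic_buffer<AddressSpaceEnum::Global>(grid.data(), grid.size());
        buf(3) = 1.5f;
        std::printf("%f\n", static_cast<double>(buf(3)));
        return 0;
    }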
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp

@@ -61,7 +61,7 @@ template <index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AGridDesc_B_K0_M_K1,
           typename BGridDesc_B_K0_N_K1,
           typename CMNGridDesc,
@@ -305,11 +305,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         const CElementwiseOperation& c_element_op,
         const CBlockClusterAdaptor& c_block_cluster_adaptor)
     {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_b_grid, b_b_k0_n_k1_grid_desc.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());

         const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1);
@@ -399,7 +399,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
             BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                               AElementwiseOperation,
                                               ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               Sequence<1, K0PerBlock, MPerBlock, K1>,
                                               ABlockTransferThreadClusterLengths_K0_M_K1,
                                               ABlockTransferThreadClusterArrangeOrder,
@@ -429,7 +429,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
             BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                               BElementwiseOperation,
                                               ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               Sequence<1, K0PerBlock, NPerBlock, K1>,
                                               BBlockTransferThreadClusterLengths_K0_N_K1,
                                               BBlockTransferThreadClusterArrangeOrder,
@@ -486,9 +486,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);
         constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);

-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize());
-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());

         // preload data into LDS
@@ -560,7 +560,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         constexpr auto c_block_desc_mblock_mperblock_nblock_nperblock =
             GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();

-        auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<FloatC*>(p_shared_block),
             c_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
@@ -632,7 +632,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                 Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
                 7,
                 1,
-                InMemoryDataOperationEnum_t::Set,
+                InMemoryDataOperationEnum::Set,
                 1,
                 true>{c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp

@@ -64,7 +64,7 @@ template <
           typename FloatAcc,
           typename FloatCShuffle,
           typename FloatC,
-          InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AGridDesc_AK0_M_AK1,
           typename BGridDesc_BK0_N_BK1,
           typename CGridDesc_M_N,
@@ -369,11 +369,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
         const CElementwiseOperation& c_element_op,
         const Block2CTileMap& block_2_ctile_map)
     {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_grid,
             c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                 .GetElementSpaceSize());
@@ -403,7 +403,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                               AElementwiseOperation,
                                               ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               Sequence<AK0, MPerBlock, AK1>,
                                               ABlockTransferThreadClusterLengths_AK0_M_AK1,
                                               ABlockTransferThreadClusterArrangeOrder,
@@ -434,7 +434,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                               BElementwiseOperation,
                                               ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               Sequence<BK0, NPerBlock, BK1>,
                                               BBlockTransferThreadClusterLengths_BK0_N_BK1,
                                               BBlockTransferThreadClusterArrangeOrder,
@@ -488,10 +488,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
             a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);

-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<FloatAB*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());
@@ -567,7 +567,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
         constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
             GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl();

-        auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<FloatCShuffle*>(p_shared),
             c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                 .GetElementSpaceSize());
@@ -644,7 +644,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
                 Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
                 7,
                 1,
-                InMemoryDataOperationEnum_t::Set,
+                InMemoryDataOperationEnum::Set,
                 1,
                 true>{c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp

@@ -68,7 +68,7 @@ template <
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AGridDesc_K0_M_K1,
           typename BGridDesc_K0_N_K1,
           typename CGridDesc_M_N,
@@ -382,15 +382,15 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
         const CElementwiseOperation& c_element_op,
         const Block2CTileMap& block_2_ctile_map)
     {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_grid,
             c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                 .GetElementSpaceSize());
-        auto c0_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c0_grid,
             c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                 .GetElementSpaceSize());
@@ -422,7 +422,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
             BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                               AElementwiseOperation,
                                               ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               Sequence<K0PerBlock, MPerBlock, K1>,
                                               ABlockTransferThreadClusterLengths_K0_M_K1,
                                               ABlockTransferThreadClusterArrangeOrder,
@@ -453,7 +453,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
             BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                               BElementwiseOperation,
                                               ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               Sequence<K0PerBlock, NPerBlock, K1>,
                                               BBlockTransferThreadClusterLengths_K0_N_K1,
                                               BBlockTransferThreadClusterArrangeOrder,
@@ -505,10 +505,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
             a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);

-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<FloatAB*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());

-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
             b_block_desc_k0_n_k1.GetElementSpaceSize());
@@ -582,7 +582,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
         constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
             GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl();

-        auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<FloatC*>(p_shared),
             c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                 .GetElementSpaceSize());
@@ -661,7 +661,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
                 Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
                 7,
                 1,
-                InMemoryDataOperationEnum_t::Set,
+                InMemoryDataOperationEnum::Set,
                 1,
                 true>{c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp

@@ -74,7 +74,7 @@ template <
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AGridDesc_K0_M_K1,
           typename BGridDesc_K0_N_K1,
           typename CGridDesc_M_N,
@@ -397,19 +397,19 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
         const CElementwiseOperation& c_element_op,
         const Block2CTileMap& block_2_ctile_map)
     {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_grid,
             c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                 .GetElementSpaceSize());
-        auto c0_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c0_grid,
             c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                 .GetElementSpaceSize());
-        auto c1_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c1_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c1_grid,
             c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                 .GetElementSpaceSize());
@@ -441,7 +441,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
             BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                               AElementwiseOperation,
                                               ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               Sequence<K0PerBlock, MPerBlock, K1>,
                                               ABlockTransferThreadClusterLengths_K0_M_K1,
                                               ABlockTransferThreadClusterArrangeOrder,
@@ -471,7 +471,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
             BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                               BElementwiseOperation,
                                               ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               Sequence<K0PerBlock, NPerBlock, K1>,
                                               BBlockTransferThreadClusterLengths_K0_N_K1,
                                               BBlockTransferThreadClusterArrangeOrder,
@@ -522,10 +522,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
             a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);

-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<FloatAB*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());

-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
             b_block_desc_k0_n_k1.GetElementSpaceSize());
@@ -599,7 +599,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
         constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
             GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl();

-        auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<FloatC*>(p_shared),
             c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                 .GetElementSpaceSize());
@@ -678,7 +678,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
                 Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
                 7,
                 1,
-                InMemoryDataOperationEnum_t::Set,
+                InMemoryDataOperationEnum::Set,
                 1,
                 true>{c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp

@@ -45,13 +45,13 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe
     const index_t thread_global_id = block_global_id * BlockSize + thread_local_id;

-    StaticBuffer<AddressSpaceEnum_t::Vgpr, DataType, 1, true> value_buf;
+    StaticBuffer<AddressSpaceEnum::Vgpr, DataType, 1, true> value_buf;

     value_buf(I0) = value;

     constexpr auto val_buff_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

-    auto global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+    auto global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
         p_global, grid_1d_buffer_desc.GetElementSpaceSize());

     if(thread_global_id < grid_1d_buffer_desc.GetElementSize())
@@ -65,7 +65,7 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe
                                                Sequence<0>,
                                                0,
                                                1,
-                                               InMemoryDataOperationEnum_t::Set,
+                                               InMemoryDataOperationEnum::Set,
                                                1,
                                                true>(
             grid_1d_buffer_desc, make_multi_index(thread_global_id), PassThroughOp{});
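kernel_buffer_set_value above computes a flat per-thread id (block_global_id * BlockSize + thread_local_id), keeps the fill value in a one-element register buffer, and lets each in-range thread write it through a threadwise transfer. Stripped of the descriptor machinery, the access pattern is the familiar one-thread-per-element fill; the following stand-alone CUDA sketch is illustrative only (hypothetical names, not the ck implementation) and shows just the indexing and bounds check.

    #include <cuda_runtime.h>

    // One thread per element: each in-range thread writes `value` at its global id.
    template <int BlockSize, typename DataType>
    __global__ void buffer_set_value_sketch(DataType* p_global, int element_size, DataType value)
    {
        const int thread_global_id = blockIdx.x * BlockSize + threadIdx.x;
        if(thread_global_id < element_size)
        {
            p_global[thread_global_id] = value;
        }
    }

    int main()
    {
        constexpr int block_size = 256;
        const int n              = 1000;
        float* d_buf             = nullptr;
        cudaMalloc(&d_buf, n * sizeof(float));
        const int grid_size = (n + block_size - 1) / block_size; // round up to cover all elements
        buffer_set_value_sketch<block_size><<<grid_size, block_size>>>(d_buf, n, 1.0f);
        cudaDeviceSynchronize();
        cudaFree(d_buf);
        return 0;
    }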
include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp (new file, 0 → 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#include "reduction_functions_accumulate.hpp"
namespace ck {

// Assume
//  1) SrcDesc is known at compile-time
//  2) DstDesc is known at compile-time
//  3) SrcBuffer is static buffer
//  4) DstBuffer is static buffer
template <typename AccDataType,
          typename SrcThreadDesc_M_K,
          typename DstThreadDesc_M,
          typename OpReduce,
          bool PropagateNan>
struct ThreadwiseReduction
{
    static constexpr auto src_thread_desc_m_k = SrcThreadDesc_M_K{};
    static constexpr auto dst_thread_desc_m   = DstThreadDesc_M{};

    static constexpr auto src_length_m = src_thread_desc_m_k.GetLength(Number<0>{});
    static constexpr auto src_length_k = src_thread_desc_m_k.GetLength(Number<1>{});
    static constexpr auto dst_length_m = dst_thread_desc_m.GetLength(Number<0>{});

    static_assert(src_length_m == dst_length_m, "lengths of source and dst buffer must match!");

    using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>;

    template <typename SrcBufferType, typename DstBufferType>
    __device__ static void Reduce(const SrcBufferType& src_buf, DstBufferType& dst_buf)
    {
        static_for<0, src_length_m, 1>{}([&](auto iM) {
            constexpr index_t out_offset = dst_thread_desc_m.CalculateOffset(make_tuple(iM));

            static_for<0, src_length_k, 1>{}([&](auto iK) {
                constexpr auto offset = src_thread_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                Accumulation::Calculate(dst_buf(Number<out_offset>{}), src_buf[Number<offset>{}]);
            });
        });
    };
};

// Assume
//  1) SrcDesc is known at compile-time
//  2) DstDesc is known at compile-time
//  3) SrcBuffer is static buffer
//  4) DstBuffer is static buffer
template <typename AccDataType,
          typename IndexDataType,
          typename SrcThreadDesc_M_K,
          typename DstThreadDesc_M,
          typename OpReduce,
          bool PropagateNan>
struct ThreadwiseReductionWithIndex
{
    static constexpr auto src_thread_desc_m_k = SrcThreadDesc_M_K{};
    static constexpr auto dst_thread_desc_m   = DstThreadDesc_M{};

    static constexpr auto src_length_m = src_thread_desc_m_k.GetLength(Number<0>{});
    static constexpr auto src_length_k = src_thread_desc_m_k.GetLength(Number<1>{});
    static constexpr auto dst_length_m = dst_thread_desc_m.GetLength(Number<0>{});

    static_assert(src_length_m == dst_length_m, "lengths of source and dst buffer must match!");

    using Accumulation =
        detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>;

    template <typename SrcValueBufferType,
              typename SrcIndexBufferType,
              typename DstValueBufferType,
              typename DstIndexBufferType>
    __device__ static void Reduce(const SrcValueBufferType& src_val_buf,
                                  const SrcIndexBufferType& src_idx_buf,
                                  DstValueBufferType& dst_val_buf,
                                  DstIndexBufferType& dst_idx_buf)
    {
        static_for<0, src_length_m, 1>{}([&](auto iM) {
            constexpr index_t out_offset = dst_thread_desc_m.CalculateOffset(make_tuple(iM));

            static_for<0, src_length_k, 1>{}([&](auto iK) {
                constexpr auto offset = src_thread_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                Accumulation::Calculate(dst_val_buf(Number<out_offset>{}),
                                        src_val_buf[Number<offset>{}],
                                        dst_idx_buf(Number<out_offset>{}),
                                        src_idx_buf[Number<offset>{}]);
            });
        });
    };
};

}; // end of namespace ck

#endif
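ThreadwiseReduction::Reduce above folds, for each of the M entries a thread owns, the K values of its register tile into the destination entry through the chosen accumulate functor. The following host-side model is illustrative only: plain loops and a lambda stand in for the compile-time descriptors, static_for and OpReduce, and a packed row-major M×K layout is assumed.

    #include <array>
    #include <cstdio>

    // Model of the per-thread M x K -> M reduction done by ThreadwiseReduction::Reduce.
    template <typename AccDataType, int M, int K, typename OpReduce>
    void threadwise_reduce_model(const std::array<AccDataType, M * K>& src_buf,
                                 std::array<AccDataType, M>& dst_buf,
                                 OpReduce op)
    {
        for(int iM = 0; iM < M; ++iM)     // dst offset = iM
        {
            for(int iK = 0; iK < K; ++iK) // src offset = iM * K + iK (packed M x K descriptor)
            {
                dst_buf[iM] = op(dst_buf[iM], src_buf[iM * K + iK]);
            }
        }
    }

    int main()
    {
        std::array<float, 2 * 3> src = {1, 2, 3, 4, 5, 6};
        std::array<float, 2> dst     = {0, 0}; // identity of the "add" reduction
        threadwise_reduce_model<float, 2, 3>(src, dst, [](float a, float b) { return a + b; });
        std::printf("%f %f\n", static_cast<double>(dst[0]), static_cast<double>(dst[1])); // 6 15
        return 0;
    }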
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp

@@ -56,7 +56,7 @@ template <typename SrcData,
           typename DimAccessOrder,
           index_t DstVectorDim,
           index_t DstScalarPerVector,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
           index_t DstScalarStrideInVector,
           bool DstResetCoordinateAfterRun,
           typename enable_if<SrcDesc::IsKnownAtCompileTime(), bool>::type = false>
@@ -407,7 +407,7 @@ struct ThreadwiseTensorSliceTransfer_v2
 // 3. src_slice_origin and dst_slice_origin are not known at compile-time,
 // 4. Use thread buffer
 template <typename SliceLengths,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
           typename SrcData,
           typename DstData,
           typename SrcDesc,
@@ -464,8 +464,8 @@ struct ThreadwiseTensorSliceTransfer_v3
     __device__ void
     RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
     {
-        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
-                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
+                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                       "wrong!");

         static_assert(
@@ -621,8 +621,8 @@ struct ThreadwiseTensorSliceTransfer_v3
     __device__ void
     RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks)
     {
-        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
-                          DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
+                          DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                       "wrong!");

         static_assert(
@@ -979,7 +979,7 @@ struct ThreadwiseTensorSliceTransfer_v3
     static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();

-    StaticBuffer<AddressSpaceEnum_t::Vgpr, SrcData, buffer_size_, true> buffer_;
+    StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;

     SrcCoord src_coord_;
     DstCoord dst_coord_;
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r4.hpp (deleted, 100644 → 0; 523 lines removed)

#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R4_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R4_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"

namespace ck {

// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
// and sometimes useless instructions:
// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument
//    instead
// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same
//    tensor coordinate instead
// 3. Don't use a pointer to VGPR buffer, use vector instead

// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
// TODO: fix this

// Assume:
//   1. src:
//      1. SrcDesc is known at compile-time
//      2. SrcBuffer is StaticBuffer
//      3. SrcSliceOrginIdx is known at compile-time
//   2. dst:
//      1. DstDesc is not known at compile-time
//      2. DstBuffer is DynamicBuffer
//      3. DstSliceOrginIdx is not known at compile time
template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename Dst0Desc, // this is really one of sources, but it has same shape as DstDesc
          typename Dst1Desc, // this is really one of sources, but it has same shape as DstDesc
          typename DstElementwiseOperation,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t DstVectorDim,
          index_t DstScalarPerVector,
          InMemoryDataOperationEnum_t DstInMemOp,
          index_t DstScalarStrideInVector,
          bool DstResetCoordinateAfterRun,
          typename enable_if<SrcDesc::IsKnownAtCompileTime(), bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v1r4
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using DstCoord  = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
    using Dst0Coord = decltype(make_tensor_coordinate(Dst0Desc{}, Index{}));
    using Dst1Coord = decltype(make_tensor_coordinate(Dst1Desc{}, Index{}));

    using DstCoordStep  = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
    using Dst0CoordStep = decltype(make_tensor_coordinate_step(Dst0Desc{}, Index{}));
    using Dst1CoordStep = decltype(make_tensor_coordinate_step(Dst1Desc{}, Index{}));

    __device__ constexpr ThreadwiseTensorSliceTransfer_v1r4(
        const DstDesc& dst_desc,
        const Dst0Desc& dst0_desc,
        const Dst1Desc& dst1_desc,
        const Index& dst_slice_origin_idx,
        const DstElementwiseOperation& dst_element_op)
        : dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx)),
          dst0_coord_(make_tensor_coordinate(dst0_desc, dst_slice_origin_idx)),
          dst1_coord_(make_tensor_coordinate(dst1_desc, dst_slice_origin_idx)),
          dst_element_op_{dst_element_op}
    {
        static_assert(SrcDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc need to known at compile-time");
    }

    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
    {
        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }

    // Run(): builds forward and backward coordinate steps for dst, dst0 and dst1, walks the
    // slice in a forward/backward "snake" order (static_ford over ordered_access_lengths,
    // with forward_sweep deciding the direction of each inner dimension), loads src_buf and
    // the extra dst0/dst1 sources, applies dst_element_op_, and writes the result into
    // dst_buf with the Set / AtomicAdd / Add variant selected by DstInMemOp; a convenience
    // overload without step hacks forwards to it. (Body elided here.)
    ...

    // GetDstCoordinateResetStep(): recomputes the index of the last access and returns the
    // negative step that brings dst_coord_ back to the slice origin. (Body elided here.)
    ...

    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by Run(), then need to adjust the step here
        const auto adjusted_step_idx =
            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);

        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

    private:
    DstCoord dst_coord_;
    Dst0Coord dst0_coord_;
    Dst1Coord dst1_coord_;
    const DstElementwiseOperation dst_element_op_;
};

} // namespace ck
#endif
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r5.hpp
deleted
100644 → 0
View file @
c0f698d5
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R5_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R5_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
namespace
ck
{
// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
// and sometimes useless instructions:
// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument
// instead
// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same
// tensor coordinate instead
// 3. Don't use a pointer to VGPR buffer, use vector instead
// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
// TODO: fix this
// Assume:
// 1. src:
// 1. SrcDesc is known at compile-time
// 2. SrcBuffer is StaticBuffer
// 3. SrcSliceOrginIdx is known at compile-time
// 2. dst:
// 1. DstDesc is not known at compile-time
// 2. DstBuffer is DynamicBuffer
// 3. DstSliceOrginIdx is not known at compile time
template
<
typename
SrcData
,
typename
DstData
,
typename
SrcDesc
,
typename
DstDesc
,
typename
Dst0Desc
,
// this is really one of sources, but it has same shape as DstDesc
typename
DstElementwiseOperation
,
typename
SliceLengths
,
typename
DimAccessOrder
,
index_t
DstVectorDim
,
index_t
DstScalarPerVector
,
InMemoryDataOperationEnum_t
DstInMemOp
,
index_t
DstScalarStrideInVector
,
bool
DstResetCoordinateAfterRun
,
typename
enable_if
<
SrcDesc
::
IsKnownAtCompileTime
(),
bool
>
::
type
=
false
>
struct
ThreadwiseTensorSliceTransfer_v1r5
{
static
constexpr
index_t
nDim
=
SliceLengths
::
Size
();
using
Index
=
MultiIndex
<
nDim
>
;
using
DstCoord
=
decltype
(
make_tensor_coordinate
(
DstDesc
{},
Index
{}));
using
Dst0Coord
=
decltype
(
make_tensor_coordinate
(
Dst0Desc
{},
Index
{}));
using
DstCoordStep
=
decltype
(
make_tensor_coordinate_step
(
DstDesc
{},
Index
{}));
using
Dst0CoordStep
=
decltype
(
make_tensor_coordinate_step
(
Dst0Desc
{},
Index
{}));
__device__
constexpr
ThreadwiseTensorSliceTransfer_v1r5
(
const
DstDesc
&
dst_desc
,
const
Dst0Desc
&
dst0_desc
,
const
Index
&
dst_slice_origin_idx
,
const
DstElementwiseOperation
&
dst_element_op
)
:
dst_coord_
(
make_tensor_coordinate
(
dst_desc
,
dst_slice_origin_idx
)),
dst0_coord_
(
make_tensor_coordinate
(
dst0_desc
,
dst_slice_origin_idx
)),
dst_element_op_
{
dst_element_op
}
{
static_assert
(
SrcDesc
::
IsKnownAtCompileTime
(),
"wrong! SrcDesc need to known at compile-time"
);
}
__device__
void
SetDstSliceOrigin
(
const
DstDesc
&
dst_desc
,
const
Index
&
dst_slice_origin_idx
)
{
dst_coord_
=
make_tensor_coordinate
(
dst_desc
,
dst_slice_origin_idx
);
}
template
<
typename
SrcSliceOriginIdx
,
typename
SrcBuffer
,
typename
DstBuffer
,
typename
Dst0Buffer
,
typename
DstStepHacks
,
typename
Dst0StepHacks
>
__device__
void
Run
(
const
SrcDesc
&
,
const
SrcSliceOriginIdx
&
,
const
SrcBuffer
&
src_buf
,
const
DstDesc
&
dst_desc
,
DstBuffer
&
dst_buf
,
const
DstStepHacks
&
dst_step_hacks
,
const
Dst0Desc
&
dst0_desc
,
const
Dst0Buffer
&
dst0_buf
,
const
Dst0StepHacks
&
dst0_step_hacks
)
{
static_assert
(
SrcDesc
::
IsKnownAtCompileTime
(),
"wrong! SrcDesc need to known at compile-time"
);
static_assert
(
is_known_at_compile_time
<
remove_cvref_t
<
SrcSliceOriginIdx
>>::
value
,
"wrong! SrcSliceOrigin need to known at compile-time"
);
static_assert
(
SrcBuffer
::
IsStaticBuffer
(),
"wrong! SrcBuffer need to be StaticBuffer"
);
// SrcDesc and src_slice_origin_idx are known at compile-time
constexpr
auto
src_desc
=
remove_cvref_t
<
SrcDesc
>
{};
constexpr
auto
src_slice_origin_idx
=
to_multi_index
(
SrcSliceOriginIdx
{});
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr
auto
dst_scalar_per_access
=
generate_sequence
(
detail
::
lambda_scalar_per_access
<
DstVectorDim
,
DstScalarPerVector
>
{},
Number
<
nDim
>
{});
constexpr
auto
access_lengths
=
SliceLengths
{}
/
dst_scalar_per_access
;
constexpr
auto
dim_access_order
=
DimAccessOrder
{};
constexpr
auto
ordered_access_lengths
=
container_reorder_given_new2old
(
access_lengths
,
dim_access_order
);
// make forward steps: dst
const
auto
dst_forward_steps
=
generate_tuple
(
[
&
](
auto
i
)
{
Index
forward_step_idx
;
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
j
)
{
forward_step_idx
(
j
)
=
(
i
.
value
==
j
.
value
)
?
dst_scalar_per_access
[
i
]
:
0
;
});
return
make_tensor_coordinate_step
(
dst_desc
,
forward_step_idx
,
dst_step_hacks
[
I0
][
i
]);
},
Number
<
nDim
>
{});
// make forward steps: dst0
// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
// TODO: fix this
const
auto
dst0_forward_steps
=
generate_tuple
(
[
&
](
auto
i
)
{
Index
forward_step_idx
;
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
j
)
{
forward_step_idx
(
j
)
=
(
i
.
value
==
j
.
value
)
?
dst_scalar_per_access
[
i
]
:
0
;
});
return
make_tensor_coordinate_step
(
dst0_desc
,
forward_step_idx
,
dst0_step_hacks
[
I0
][
i
]);
},
Number
<
nDim
>
{});
// make backward steps: dst
const
auto
dst_backward_steps
=
generate_tuple
(
[
&
](
auto
i
)
{
Index
backward_step_idx
;
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
j
)
{
backward_step_idx
(
j
)
=
(
i
.
value
==
j
.
value
)
?
-
dst_scalar_per_access
[
i
]
:
0
;
});
return
make_tensor_coordinate_step
(
dst_desc
,
backward_step_idx
,
dst_step_hacks
[
I1
][
i
]);
},
Number
<
nDim
>
{});
// make backward steps: dst0
// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
// TODO: fix this
const
auto
dst0_backward_steps
=
generate_tuple
(
[
&
](
auto
i
)
{
Index
backward_step_idx
;
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
j
)
{
backward_step_idx
(
j
)
=
(
i
.
value
==
j
.
value
)
?
-
dst_scalar_per_access
[
i
]
:
0
;
});
return
make_tensor_coordinate_step
(
dst0_desc
,
backward_step_idx
,
dst0_step_hacks
[
I1
][
i
]);
},
Number
<
nDim
>
{});
// loop over tensor and copy
static_ford
<
decltype
(
ordered_access_lengths
)
>
{}([
&
](
auto
ordered_access_idx
)
{
// judge move forward or move backward
constexpr
auto
forward_sweep
=
[
&
]()
{
StaticallyIndexedArray
<
bool
,
nDim
>
forward_sweep_
;
forward_sweep_
(
I0
)
=
true
;
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_access_idx
[
I0
];
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_access_lengths
[
j
]
+
ordered_access_idx
[
j
];
});
forward_sweep_
(
i
)
=
tmp
%
2
==
0
;
});
return
forward_sweep_
;
}();
// calculate dst data index
constexpr
auto
dst_data_idx
=
[
&
]()
{
Index
ordered_idx
;
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
ordered_idx
(
i
)
=
forward_sweep
[
i
]
?
ordered_access_idx
[
i
]
:
ordered_access_lengths
[
i
]
-
1
-
ordered_access_idx
[
i
];
});
return
container_reorder_given_old2new
(
ordered_idx
,
dim_access_order
)
*
dst_scalar_per_access
;
}();
typename
vector_type_maker
<
DstData
,
DstScalarPerVector
>::
type
dst_vector
;
using
dst_vector_t
=
typename
vector_type_maker
<
DstData
,
DstScalarPerVector
>::
type
::
type
;
// load dst0 and apply elementwise operation
{
// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
// TODO: fix this
static_assert
(
DstScalarPerVector
==
1
,
"wrong!"
);
// copy data from src_buf into dst_vector_src_data
constexpr
index_t
src_offset
=
src_desc
.
CalculateOffset
(
src_slice_origin_idx
+
dst_data_idx
);
const
SrcData
src_v
=
src_buf
[
Number
<
src_offset
>
{}];
// load dst0
const
bool
is_dst0_valid
=
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
dst0_desc
,
dst0_coord_
);
const
DstData
dst0_v
=
dst0_buf
.
template
Get
<
DstData
>(
dst0_coord_
.
GetOffset
(),
is_dst0_valid
);
#if !CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE
// apply element-wise operation in SrcData type
const
                    SrcData dst_v = dst_element_op_(src_v, type_convert<SrcData>(dst0_v));

                // apply type convert
                dst_vector.template AsType<DstData>()(Number<0>{}) = type_convert<DstData>(dst_v);
#else
                // apply element-wise operation in DstData type
                const DstData dst_v = dst_element_op_(src_v, dst0_v);

                dst_vector.template AsType<DstData>()(Number<0>{}) = dst_v;
#endif
            }

            const bool is_dst_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);

            // copy data from dst_vector into dst_buf
            if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set)
            {
                dst_buf.template Set<dst_vector_t>(
                    dst_coord_.GetOffset(),
                    is_dst_valid,
                    dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
            }
            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd)
            {
                dst_buf.template AtomicAdd<dst_vector_t>(
                    dst_coord_.GetOffset(),
                    is_dst_valid,
                    dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
            }
            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add)
            {
                typename vector_type_maker<DstData, DstScalarPerVector>::type tmp;

                tmp.template AsType<dst_vector_t>()(Number<0>{}) =
                    dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid);

                static_for<0, DstScalarPerVector, 1>{}([&](auto t) {
                    dst_vector.template AsType<DstData>()(t) += tmp.template AsType<DstData>()[t];
                });

                dst_buf.template Set<dst_vector_t>(
                    dst_coord_.GetOffset(),
                    is_dst_valid,
                    dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
            }

            constexpr auto move_on_dim = [&]() constexpr
            {
                StaticallyIndexedArray<bool, nDim> move_on_dim_;

                static_for<0, nDim, 1>{}([&](auto i) {
                    move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;

                    static_for<i + 1, nDim, 1>{}([&](auto j) {
                        move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
                    });
                });

                return move_on_dim_;
            }();

            // move
            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(move_on_dim[i])
                {
                    if constexpr(forward_sweep[i])
                    {
                        move_tensor_coordinate(
                            dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);

                        // dst0
                        move_tensor_coordinate(
                            dst0_desc, dst0_coord_, dst0_forward_steps[dim_access_order[i]]);
                    }
                    else
                    {
                        move_tensor_coordinate(
                            dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);

                        // dst0
                        move_tensor_coordinate(
                            dst0_desc, dst0_coord_, dst0_backward_steps[dim_access_order[i]]);
                    }
                }
            });
        });

        // move dst coordinate back to slice origin (or not)
        if constexpr(DstResetCoordinateAfterRun)
        {
            const auto dst_reset_step =
                make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());

            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
        }
    }

    template <typename SrcSliceOriginIdx,
              typename SrcBuffer,
              typename DstBuffer,
              typename Dst0Buffer>
    __device__ void Run(const SrcDesc&,
                        const SrcSliceOriginIdx&,
                        const SrcBuffer& src_buf,
                        const DstDesc& dst_desc,
                        DstBuffer& dst_buf,
                        const Dst0Desc& dst0_desc,
                        const Dst0Buffer& dst0_buf)
    {
        auto f_step_hacks = [&](auto desc) {
            constexpr index_t ntransform = decltype(desc)::GetNumOfTransform();

            constexpr auto zeros = typename uniform_sequence_gen<ntransform, 0>::type{};

            constexpr auto step_hacks =
                make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
                           generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));

            return step_hacks;
        };

        Run(SrcDesc{},
            SrcSliceOriginIdx{},
            src_buf,
            dst_desc,
            dst_buf,
            f_step_hacks(dst_desc),
            dst0_desc,
            dst0_buf,
            f_step_hacks(dst0_desc));
    }

    __device__ static constexpr auto GetDstCoordinateResetStep()
    {
        constexpr auto I0 = Number<0>{};

        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

        constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;

        constexpr auto dim_access_order = DimAccessOrder{};

        constexpr auto ordered_access_lengths =
            container_reorder_given_new2old(access_lengths, dim_access_order);

        // judge move forward or move backward during the last iteration
        constexpr auto forward_sweep = [&]() {
            StaticallyIndexedArray<bool, nDim> forward_sweep_;

            forward_sweep_(I0) = true;

            static_for<1, nDim, 1>{}([&](auto i) {
                index_t tmp = ordered_access_lengths[I0] - 1;

                static_for<1, i, 1>{}([&](auto j) {
                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
                });

                forward_sweep_(i) = tmp % 2 == 0;
            });

            return forward_sweep_;
        }();

        // calculate dst data index after the last iteration in Run(), if it has not been reset by
        // RunWrite()
        constexpr auto dst_data_idx = [&]() {
            Index ordered_idx;

            static_for<0, nDim, 1>{}([&](auto i) {
                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
            });

            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
                   dst_scalar_per_access;
        }();

        //
        constexpr auto reset_dst_data_step = [&]() {
            Index reset_dst_data_step_;

            static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });

            return reset_dst_data_step_;
        }();

        return reset_dst_data_step;
    }

    // dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by Run(), then we need to adjust the step here
        const auto adjusted_step_idx =
            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);

        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

    private:
    DstCoord dst_coord_;
    Dst0Coord dst0_coord_;

    const DstElementwiseOperation dst_element_op_;
};

} // namespace ck
#endif
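The Set / AtomicAdd / Add branches above pick the destination memory operation at compile time with if constexpr on DstInMemOp. The standalone sketch below shows the same dispatch shape in plain C++; ToyBuffer, InMemOp and write_element are hypothetical stand-ins, not the CK buffer API.

// Minimal sketch of compile-time dispatch on a destination memory operation,
// mirroring the Set / AtomicAdd / Add branches above (illustrative names only).
#include <cstdio>
#include <vector>

enum struct InMemOp { Set, AtomicAdd, Add };

struct ToyBuffer
{
    std::vector<float> data;
};

template <InMemOp Op>
void write_element(ToyBuffer& buf, int offset, float v)
{
    if constexpr(Op == InMemOp::Set)
    {
        buf.data[offset] = v; // overwrite the destination element
    }
    else // Add and, on a single host thread, AtomicAdd both accumulate
    {
        buf.data[offset] += v;
    }
}

int main()
{
    ToyBuffer buf{std::vector<float>(4, 1.0f)};
    write_element<InMemOp::Set>(buf, 0, 5.0f); // buf.data[0] == 5.0
    write_element<InMemOp::Add>(buf, 1, 2.5f); // buf.data[1] == 3.5
    std::printf("%f %f\n", buf.data[0], buf.data[1]);
    return 0;
}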
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
View file @ 07a673c6
...
@@ -48,7 +48,7 @@ struct lambda_scalar_per_access_for_src_and_dst
template <typename SliceLengths,
          typename SrcElementwiseOperation,
          typename DstElementwiseOperation,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          typename SrcData,
          typename DstData,
          typename SrcDesc,
...
@@ -110,8 +110,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                            const SrcBuffer& src_buf,
                            Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
    {
-        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
-                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
+                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
...
@@ -271,7 +271,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
        static_ford<SliceLengths>{}([&](auto idx) {
            // convert from SrcData to DstData here
            dst_thread_scratch_(idx) =
-                type_convert<DstData>(src_thread_scratch_tuple[thread_scratch_id][idx]);
+                type_convert<DstData>(src_thread_scratch_tuple_[thread_scratch_id][idx]);
        });
#else
        // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
...
@@ -361,8 +361,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
        // TODO move this elsewhere
        TransferDataFromSrcThreadScratchToDstThreadScratch(thread_scratch_id);

-        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
-                          DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
+                          DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
...
@@ -763,13 +763,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
    static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){};
    static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){};

-    using SrcThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum_t::Vgpr,
+    using SrcThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
                                                              SrcData,
                                                              SrcScalarPerVector,
                                                              decltype(src_thread_scratch_desc_),
                                                              true>;

-    using DstThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum_t::Vgpr,
+    using DstThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
                                                              DstData,
                                                              DstScalarPerVector,
                                                              decltype(dst_thread_scratch_desc_),
...
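The hunks in this file lean on CK's compile-time loop utilities (static_for, static_ford) so that every access index is a constant expression. The sketch below is an illustrative fold-expression version of that idea only; CK's real static_for takes begin/end/step template parameters and is implemented differently.

// Illustrative compile-time counted loop: the functor receives
// std::integral_constant indices, so each iteration can feed template
// arguments or if constexpr branches. Not CK's implementation.
#include <cstdio>
#include <utility>

template <typename F, int... Is>
constexpr void toy_static_for_impl(F&& f, std::integer_sequence<int, Is...>)
{
    (f(std::integral_constant<int, Is>{}), ...); // expand the body N times
}

template <int N, typename F>
constexpr void toy_static_for(F&& f)
{
    toy_static_for_impl(std::forward<F>(f), std::make_integer_sequence<int, N>{});
}

int main()
{
    toy_static_for<4>([](auto i) { std::printf("%d ", decltype(i)::value); }); // prints 0 1 2 3
    std::printf("\n");
    return 0;
}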
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp
View file @ 07a673c6
...
@@ -48,7 +48,7 @@ struct lambda_scalar_per_access_for_src_and_dst
template <typename SliceLengths,
          typename SrcElementwiseOperation,
          typename DstElementwiseOperation,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          typename SrcData,
          typename DstData,
          typename SrcDesc,
...
@@ -120,8 +120,8 @@ struct ThreadwiseTensorSliceTransfer_v3r3
    template <typename SrcBuffer>
    __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
    {
-        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
-                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
+                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
...
@@ -369,8 +369,8 @@ struct ThreadwiseTensorSliceTransfer_v3r3
        // TODO move this elsewhere
        TransferDataFromSrcThreadScratchToDstThreadScratch();

-        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
-                          DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
+                          DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
...
@@ -859,14 +859,14 @@ struct ThreadwiseTensorSliceTransfer_v3r3
    static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){};
    static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){};

-    StaticTensorTupleOfVectorBuffer<AddressSpaceEnum_t::Vgpr,
+    StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
                                    SrcData,
                                    SrcScalarPerVector,
                                    decltype(src_thread_scratch_desc_),
                                    true>
        src_thread_scratch_;

-    StaticTensorTupleOfVectorBuffer<AddressSpaceEnum_t::Vgpr,
+    StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
                                    DstData,
                                    DstScalarPerVector,
                                    decltype(dst_thread_scratch_desc_),
...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
View file @ 07a673c6
...
@@ -13,7 +13,7 @@ namespace ck {
// 3. src_slice_origin and dst_slice_origin are not known at compile-time,
// 4. Use thread buffer
template <typename SliceLengths,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          typename SrcData,
          typename DstData,
          typename SrcDesc,
...
@@ -76,8 +76,8 @@ struct ThreadwiseTensorSliceTransfer_v5r1
    __device__ void
    RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
    {
-        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
-                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
+                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
...
@@ -244,8 +244,8 @@ struct ThreadwiseTensorSliceTransfer_v5r1
    __device__ void
    RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks)
    {
-        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
-                          DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
+                          DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
...
@@ -602,7 +602,7 @@ struct ThreadwiseTensorSliceTransfer_v5r1
    static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();

-    StaticBuffer<AddressSpaceEnum_t::Vgpr, SrcData, buffer_size_, true> buffer_;
+    StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;

    SrcCoord src_coord_;
    DstCoord dst_coord_;
...
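StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> in the last hunk is a thread-private, statically sized register buffer. The snippet below is only a rough host-side analogue of that idea, with hypothetical names; the real CK type additionally encodes the address space (VGPR) and element access in its template parameters.

// Host-side stand-in for a statically sized, thread-private buffer
// (illustration only, not the CK StaticBuffer).
#include <array>
#include <cstddef>

template <typename T, std::size_t N>
struct ToyStaticBuffer
{
    std::array<T, N> data{};

    constexpr T& operator()(std::size_t i) { return data[i]; }             // mutable access
    constexpr const T& operator[](std::size_t i) const { return data[i]; } // read-only access
};

int main()
{
    ToyStaticBuffer<float, 8> buf;
    buf(0) = 1.0f;
    return buf[0] == 1.0f ? 0 : 1;
}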
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp
View file @ 07a673c6
...
@@ -29,7 +29,7 @@ template <typename SrcData,
          typename DimAccessOrder,
          index_t VectorDim,
          index_t ScalarPerVector,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          bool SrcResetCoordinateAfterRun,
          bool DstResetCoordinateAfterRun>
struct ThreadwiseTensorSliceTransfer_v6r1
...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp
View file @ 07a673c6
...
@@ -31,7 +31,7 @@ template <typename Src0Data,
          typename DimAccessOrder,
          index_t VectorDim,
          index_t ScalarPerVector,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          bool Src0ResetCoordinateAfterRun,
          bool Src1ResetCoordinateAfterRun,
          bool DstResetCoordinateAfterRun>
...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp
View file @ 07a673c6
...
@@ -33,7 +33,7 @@ template <typename Src0Data,
          typename DimAccessOrder,
          index_t VectorDim,
          index_t ScalarPerVector,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          bool Src0ResetCoordinateAfterRun,
          bool Src1ResetCoordinateAfterRun,
          bool Src2ResetCoordinateAfterRun,
...
include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
View file @ 07a673c6
...
@@ -476,7 +476,7 @@ struct MfmaSelector
    template <>
    static constexpr auto GetMfma<bhalf_t, 32, 32>()
    {
-#if defined(CK_AMD_GPU_GFX90A)
+#if defined(CK_USE_AMD_MFMA_BF16_1K_OP)
        return MfmaInstr::mfma_f32_32x32x8bf16_1k;
#else
        return MfmaInstr::mfma_f32_32x32x4bf16;
...
@@ -486,7 +486,7 @@ struct MfmaSelector
    template <>
    static constexpr auto GetMfma<bhalf_t, 16, 16>()
    {
-#if defined(CK_AMD_GPU_GFX90A)
+#if defined(CK_USE_AMD_MFMA_BF16_1K_OP)
        return MfmaInstr::mfma_f32_16x16x16bf16_1k;
#else
        return MfmaInstr::mfma_f32_16x16x8bf16;
...
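The change above keys the bf16 MFMA choice on CK_USE_AMD_MFMA_BF16_1K_OP instead of a GPU-architecture macro. The selection itself is ordinary compile-time specialisation plus a preprocessor switch; the compile-only sketch below uses stand-in names and a class specialisation, so only the dispatch shape mirrors MfmaSelector.

// Stand-in instruction enum and selector (illustrative names only).
enum struct ToyInstr
{
    bf16_32x32x4,
    bf16_32x32x8_1k
};

struct toy_bhalf_t; // stand-in for bhalf_t

template <typename T, int M, int N>
struct ToySelector;

template <>
struct ToySelector<toy_bhalf_t, 32, 32>
{
#if defined(CK_USE_AMD_MFMA_BF16_1K_OP)
    static constexpr ToyInstr value = ToyInstr::bf16_32x32x8_1k; // 1k-variant when enabled
#else
    static constexpr ToyInstr value = ToyInstr::bf16_32x32x4;    // fallback instruction
#endif
};

static_assert(ToySelector<toy_bhalf_t, 32, 32>::value == ToyInstr::bf16_32x32x4 ||
                  ToySelector<toy_bhalf_t, 32, 32>::value == ToyInstr::bf16_32x32x8_1k,
              "selector must pick one of the two bf16 instructions");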
include/ck/utility/amd_address_space.hpp
View file @ 07a673c6
...
@@ -9,7 +9,7 @@
namespace ck {

-enum struct AddressSpaceEnum_t
+enum struct AddressSpaceEnum
{
    Generic,
    Global,
...
@@ -19,7 +19,7 @@ enum struct AddressSpaceEnum_t
};

template <typename T>
-__device__ T* cast_pointer_to_generic_address_space(T CONSTANT* p)
+__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p)
{
    // cast a pointer in "Constant" address space (4) to "Generic" address space (0)
    // only c-style pointer cast seems be able to be compiled
...
@@ -30,13 +30,13 @@ __device__ T* cast_pointer_to_generic_address_space(T CONSTANT* p)
}

template <typename T>
-__host__ __device__ T CONSTANT* cast_pointer_to_constant_address_space(T* p)
+__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
{
    // cast a pointer in "Generic" address space (0) to "Constant" address space (4)
    // only c-style pointer cast seems be able to be compiled
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
-    return (T CONSTANT*)p; // NOLINT(old-style-cast)
+    return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
...
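CK_CONSTANT_ADDRESS_SPACE replaces the older CONSTANT macro in the casts above. On Clang-based device compilers such a macro typically expands to an address_space qualifier; the sketch below assumes that expansion (the authoritative definition lives in CK's config.hpp and may differ) and repeats the C-style cast idiom from the diff with hypothetical names.

// Assumed expansion of a constant-address-space qualifier macro; empty on
// host-only builds so the snippet stays compilable as plain C++.
#if defined(__clang__) && defined(__HIP_DEVICE_COMPILE__)
#define TOY_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
#else
#define TOY_CONSTANT_ADDRESS_SPACE
#endif

// Same C-style cast idiom as cast_pointer_to_constant_address_space above.
template <typename T>
T TOY_CONSTANT_ADDRESS_SPACE* toy_cast_to_constant_address_space(T* p)
{
    return (T TOY_CONSTANT_ADDRESS_SPACE*)p;
}

int main()
{
    int x = 42;
    int TOY_CONSTANT_ADDRESS_SPACE* cp = toy_cast_to_constant_address_space(&x);
    return cp != nullptr ? 0 : 1;
}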
include/ck/utility/amd_buffer_addressing.hpp
View file @ 07a673c6
-#ifndef CK_AMD_BUFFER_ADDRESSING_HPP
-#define CK_AMD_BUFFER_ADDRESSING_HPP
+#pragma once

#include "data_type.hpp"

namespace ck {
...
@@ -87,6 +85,7 @@ llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");

// buffer load fp16
__device__ half_t llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc,
...
@@ -212,6 +211,7 @@ llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");

// buffer store fp32
__device__ void llvm_amdgcn_raw_buffer_store_fp32(float vdata,
...
@@ -233,6 +233,7 @@ llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");

// buffer atomic-add fp16
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(half2_t vdata,
...
@@ -1046,4 +1047,3 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
}

} // namespace ck
-#endif
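The declarations touched in this header bind ordinary function names to LLVM AMDGCN buffer intrinsics through __asm name aliasing. Below is a minimal sketch of that pattern for device compilation, using simplified stand-in types; the intrinsic name is the fp32 counterpart of the vector loads shown above, and toy_raw_buffer_load_fp32 is a hypothetical name, not part of CK.

// Clang vector extension standing in for CK's int32x4_t buffer-resource type.
typedef int toy_int32x4 __attribute__((ext_vector_type(4)));

// Declaration-only binding: the body is supplied by the compiler intrinsic
// selected via the __asm name, as in the llvm_amdgcn_raw_buffer_* set above.
__device__ float toy_raw_buffer_load_fp32(toy_int32x4 srsrc,
                                          int voffset,
                                          int soffset,
                                          int glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");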
include/ck/utility/common_header.hpp
View file @ 07a673c6
-#ifndef CK_COMMON_HEADER_HPP
-#define CK_COMMON_HEADER_HPP
+#pragma once

#include "config.hpp"
#include "array.hpp"
#include "container_helper.hpp"
...
@@ -20,30 +18,29 @@
#include "number.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
-#include "synchronization.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "type.hpp"
#include "magic_division.hpp"
#include "utility.hpp"
#include "c_style_pointer_cast.hpp"
-#include "amd_address_space.hpp"
-#include "amd_buffer_addressing.hpp"
-#include "static_buffer.hpp"
-#include "dynamic_buffer.hpp"
#include "is_known_at_compile_time.hpp"
#include "transpose_vectors.hpp"
#include "inner_product.hpp"
// #include "element_wise_operation.hpp"
#include "debug.hpp"
+#include "amd_buffer_addressing.hpp"
#include "get_id.hpp"
+#include "synchronization.hpp"
+#include "amd_address_space.hpp"
+#include "static_buffer.hpp"
+#include "dynamic_buffer.hpp"

// TODO: remove this
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
#endif

-#if CK_USE_AMD_XDLOPS
+#ifdef CK_USE_AMD_MFMA
#include "amd_xdlops.hpp"
#endif
-#endif