Merge branch 'develop' into jd/dev_pkg

dd6a8de4 · Jehandad Khan · 0aa899aa · abf4bdb9 · dd6a8de4 · dd6a8de4
Commit dd6a8de4 authored Apr 06, 2022 by Jehandad Khan
20 changed files
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -54,11 +54,85 @@ __global__ void
                                                   block_2_ctile_map);
 }
+template <typename GridwiseGemm, // Grouped-GEMM kernel entry; GridwiseGemm supplies GetSharedMemoryNumberOfByte() and Run<>()
+          typename FloatAB, // not referenced in the kernel body visible here
+          typename FloatC, // not referenced in the kernel body visible here
+          typename GemmDesc, // per-group record: block range, a/b/c pointers, grid descriptors, block-to-C-tile map
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          bool HasMainK0BlockLoop, // forwarded unchanged as the template argument of GridwiseGemm::Run
+          index_t MaxGroupCount> // compile-time capacity of gemm_desc_ and upper bound of the static_for below
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_grouped_gemm_xdlops_v2r3(
+            const StaticallyIndexedArray<GemmDesc, MaxGroupCount> gemm_desc_, // all group descriptors, passed by value as a kernel argument
+            const index_t group_count, // runtime number of groups; entries with index >= group_count are never run
+            const AElementwiseOperation a_element_op, // the three element-wise ops are shared by every group
+            const BElementwiseOperation b_element_op,
+            const CElementwiseOperation c_element_op)
+{
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; // shared scratch buffer, sized by GridwiseGemm and handed to Run
+    const index_t block_id = get_block_1d_id(); // id compared against each group's [BlockStart_, BlockEnd_) range
+#if 1 // active implementation; the #else branch is compiled out
+    static_for<0, MaxGroupCount, 1>{}([&](auto i) { // lambda is invoked for each i in [0, MaxGroupCount) -- presumably unrolled at compile time, TODO confirm
+        if(block_id >= gemm_desc_[i].BlockStart_ && block_id < gemm_desc_[i].BlockEnd_ && // half-open range test for group i
+           i < group_count) // NOTE(review): evaluated last, so gemm_desc_[i] is read even for i >= group_count
+        {
+            auto group_id = i; // alias for the matching group's index
+            GridwiseGemm::template Run<HasMainK0BlockLoop>( // run the GEMM for this group only
+                gemm_desc_[group_id].a_ptr,
+                gemm_desc_[group_id].b_ptr,
+                gemm_desc_[group_id].c_ptr,
+                p_shared,
+                gemm_desc_[group_id].a_grid_desc_k0_m_k1_,
+                gemm_desc_[group_id].b_grid_desc_k0_n_k1_,
+                gemm_desc_[group_id].c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
+                a_element_op,
+                b_element_op,
+                c_element_op,
+                gemm_desc_[group_id].grouped_gemm_block_2_ctile_map_); // per-group block-to-C-tile map
+        }
+    });
+#else // disabled alternative -- NOTE(review): uses other member names (BlockStart, BlockEnd, block_2_ctile_map_) and an extra Run argument
+    const auto gemm_desc_ptr = reinterpret_cast<const GemmDesc*>(&gemm_desc_); // assumes the array object can be viewed as contiguous GemmDesc -- TODO confirm
+    index_t group_id = 0; // stays 0 when no range matches
+    static_for<0, MaxGroupCount, 1>{}([&](auto i) {
+        group_id = (block_id >= gemm_desc_[i].BlockStart && block_id < gemm_desc_[i].BlockEnd && // same half-open range test as the active branch
+                    i < group_count)
+                       ? i
+                       : group_id; // keep the previous selection when group i does not match
+    });
+    const index_t block_id_grp = block_id - gemm_desc_ptr[group_id].BlockStart; // block id relative to the selected group's first block
+    GridwiseGemm::template Run<HasMainK0BlockLoop>(
+        gemm_desc_ptr[group_id].a_ptr,
+        gemm_desc_ptr[group_id].b_ptr,
+        gemm_desc_ptr[group_id].c_ptr,
+        p_shared,
+        gemm_desc_ptr[group_id].a_grid_desc_k0_m_k1_,
+        gemm_desc_ptr[group_id].b_grid_desc_k0_n_k1_,
+        gemm_desc_ptr[group_id].c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
+        a_element_op,
+        b_element_op,
+        c_element_op,
+        gemm_desc_ptr[group_id].block_2_ctile_map_,
+        block_id_grp); // extra trailing argument not passed by the active branch
+#endif
+}
 template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-          InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AGridDesc_K0_M_K1,
          typename BGridDesc_K0_N_K1,
          typename CGridDesc_M_N,
@@ -352,11 +426,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        const CElementwiseOperation& c_element_op,
        const Block2CTileMap& block_2_ctile_map)
    {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize());
        const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
@@ -386,7 +460,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              AElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                              Sequence<K0PerBlock, MPerBlock, K1>,
                                              ABlockTransferThreadClusterLengths_K0_M_K1,
                                              ABlockTransferThreadClusterArrangeOrder,
@@ -417,7 +491,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              BElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                              Sequence<K0PerBlock, NPerBlock, K1>,
                                              BBlockTransferThreadClusterLengths_K0_N_K1,
                                              BBlockTransferThreadClusterArrangeOrder,
@@ -469,10 +543,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        constexpr auto a_block_space_size_aligned =
            math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);
-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());
-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
            b_block_desc_k0_n_k1.GetElementSpaceSize());

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
@@ -59,7 +59,7 @@ template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-          InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename ABK0MK1GridDesc,
          typename BBK0NK1GridDesc,
          typename CMNGridDesc,
@@ -316,11 +316,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
                               const CElementwiseOperation& c_element_op,
                               const CBlockClusterAdaptor& c_block_cluster_adaptor)
    {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid, b_b_k0_n_k1_grid_desc.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc.GetElementSpaceSize());
        const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1);
@@ -410,7 +410,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              AElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                              Sequence<1, K0PerBlock, MPerBlock, K1>,
                                              ABlockTransferThreadClusterLengths_K0_M_K1,
                                              ABlockTransferThreadClusterArrangeOrder,
@@ -440,7 +440,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              BElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                              Sequence<1, K0PerBlock, NPerBlock, K1>,
                                              BBlockTransferThreadClusterLengths_K0_N_K1,
                                              BBlockTransferThreadClusterArrangeOrder,
@@ -497,9 +497,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
        constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);
        constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);
-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize());
-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());
        // preload data into LDS

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -61,7 +61,7 @@ template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-          InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AGridDesc_B_K0_M_K1,
          typename BGridDesc_B_K0_N_K1,
          typename CMNGridDesc,
@@ -277,14 +277,14 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
    __host__ __device__ static constexpr auto
    GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
    {
-        constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
+        constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL);
-        constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
+        constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL);
        return make_naive_tensor_descriptor_packed(
            make_tuple(I1,
-                       Number<CShuffleMRepeatPerShuffle * MWaves * MPerXDL>{},
+                       Number<CShuffleMRepeatPerShuffle * MWave * MPerXDL>{},
                       I1,
-                       Number<CShuffleNRepeatPerShuffle * NWaves * NPerXDL>{}));
+                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
    }
    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
@@ -305,11 +305,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                               const CElementwiseOperation& c_element_op,
                               const CBlockClusterAdaptor& c_block_cluster_adaptor)
    {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid, b_b_k0_n_k1_grid_desc.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
        const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1);
@@ -399,7 +399,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              AElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                              Sequence<1, K0PerBlock, MPerBlock, K1>,
                                              ABlockTransferThreadClusterLengths_K0_M_K1,
                                              ABlockTransferThreadClusterArrangeOrder,
@@ -429,7 +429,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              BElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                              Sequence<1, K0PerBlock, NPerBlock, K1>,
                                              BBlockTransferThreadClusterLengths_K0_N_K1,
                                              BBlockTransferThreadClusterArrangeOrder,
@@ -486,9 +486,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);
        constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);
-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize());
-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());
        // preload data into LDS
@@ -539,8 +539,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        // output: register to global memory
        {
-            constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
+            constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL);
-            constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
+            constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL);
            constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc =
                blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
@@ -560,12 +560,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
            constexpr auto c_block_desc_mblock_mperblock_nblock_nperblock =
                GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
-            auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+            auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
                static_cast<FloatC*>(p_shared_block),
                c_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
-            static_assert(M1 == MWaves, "");
+            static_assert(M1 == MWave, "");
-            static_assert(N1 == NWaves, "");
+            static_assert(N1 == NWave, "");
            static_assert(M2 * M3 * M4 == MPerXDL, "");
            static_assert(N2 == NPerXDL, "");
@@ -632,7 +632,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
                                                   7,
                                                   1,
-                                                   InMemoryDataOperationEnum_t::Set,
+                                                   InMemoryDataOperationEnum::Set,
                                                   1,
                                                   true>{
                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
@@ -646,14 +646,15 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                                     n_thread_data_on_block_idx[I2]),
                    ck::tensor_operation::element_wise::PassThrough{}};
+            // LDS to global
            auto c_block_copy_lds_to_global = BlockwiseTensorSliceTransfer_v6r1<
                BlockSize,                  // index_t BlockSize,
                CElementwiseOperation,      // ElementwiseOperation,
                CGlobalMemoryDataOperation, // DstInMemOp,
                Sequence<1,
-                         CShuffleMRepeatPerShuffle * MWaves * MPerXDL,
+                         CShuffleMRepeatPerShuffle * MWave * MPerXDL,
                         1,
-                         CShuffleNRepeatPerShuffle * NWaves * NPerXDL>, // BlockSliceLengths,
+                         CShuffleNRepeatPerShuffle * NWave * NPerXDL>, // BlockSliceLengths,
                CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
                FloatC,               // typename SrcData,
@@ -672,11 +673,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                 c_element_op};
            constexpr auto mxdlperwave_forward_step =
-                make_multi_index(0, CShuffleMRepeatPerShuffle * MWaves * MPerXDL, 0, 0);
+                make_multi_index(0, CShuffleMRepeatPerShuffle * MWave * MPerXDL, 0, 0);
            constexpr auto nxdlperwave_forward_step =
-                make_multi_index(0, 0, 0, CShuffleNRepeatPerShuffle * NWaves * NPerXDL);
+                make_multi_index(0, 0, 0, CShuffleNRepeatPerShuffle * NWave * NPerXDL);
            constexpr auto nxdlperwave_backward_step =
-                make_multi_index(0, 0, 0, -CShuffleNRepeatPerShuffle * NWaves * NPerXDL);
+                make_multi_index(0, 0, 0, -CShuffleNRepeatPerShuffle * NWave * NPerXDL);
            static_for<0, MRepeat, CShuffleMRepeatPerShuffle>{}([&](auto mxdlperwave_iter) {
                constexpr auto mxdlperwave = mxdlperwave_iter;

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -10,6 +10,7 @@
 #include "blockwise_tensor_slice_transfer_v6r1.hpp"
 #include "threadwise_tensor_slice_transfer.hpp"
 #include "gridwise_gemm_pipeline_v1.hpp"
+#include "tensor_space_filling_curve.hpp"
 namespace ck {
@@ -63,7 +64,7 @@ template <
    typename FloatAcc,
    typename FloatCShuffle,
    typename FloatC,
-    InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+    InMemoryDataOperationEnum CGlobalMemoryDataOperation,
    typename AGridDesc_AK0_M_AK1,
    typename BGridDesc_BK0_N_BK1,
    typename CGridDesc_M_N,
@@ -368,11 +369,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
        const CElementwiseOperation& c_element_op,
        const Block2CTileMap& block_2_ctile_map)
    {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid,
            c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                .GetElementSpaceSize());
@@ -402,7 +403,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              AElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                              Sequence<AK0, MPerBlock, AK1>,
                                              ABlockTransferThreadClusterLengths_AK0_M_AK1,
                                              ABlockTransferThreadClusterArrangeOrder,
@@ -433,7 +434,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              BElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                              Sequence<BK0, NPerBlock, BK1>,
                                              BBlockTransferThreadClusterLengths_BK0_N_BK1,
                                              BBlockTransferThreadClusterArrangeOrder,
@@ -487,10 +488,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
@@ -566,7 +567,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
            constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
                GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl();
-            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
                static_cast<FloatCShuffle*>(p_shared),
                c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                    .GetElementSpaceSize());
@@ -643,7 +644,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
                                                   7,
                                                   1,
-                                                   InMemoryDataOperationEnum_t::Set,
+                                                   InMemoryDataOperationEnum::Set,
                                                   1,
                                                   true>{
                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
@@ -657,6 +658,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
                                     n_thread_data_on_block_idx[I2]),
                    ck::tensor_operation::element_wise::PassThrough{}};
+            // LDS to global
            auto c_block_copy_lds_to_global = BlockwiseTensorSliceTransfer_v6r1<
                BlockSize,                  // index_t BlockSize,
                CElementwiseOperation,      // ElementwiseOperation,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
@@ -68,7 +68,7 @@ template <
    typename FloatAB,
    typename FloatAcc,
    typename FloatC,
-    InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+    InMemoryDataOperationEnum CGlobalMemoryDataOperation,
    typename AGridDesc_K0_M_K1,
    typename BGridDesc_K0_N_K1,
    typename CGridDesc_M_N,
@@ -382,15 +382,15 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
        const CElementwiseOperation& c_element_op,
        const Block2CTileMap& block_2_ctile_map)
    {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid,
            c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                .GetElementSpaceSize());
-        auto c0_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c0_grid,
            c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                .GetElementSpaceSize());
@@ -422,7 +422,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              AElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                              Sequence<K0PerBlock, MPerBlock, K1>,
                                              ABlockTransferThreadClusterLengths_K0_M_K1,
                                              ABlockTransferThreadClusterArrangeOrder,
@@ -453,7 +453,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              BElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                              Sequence<K0PerBlock, NPerBlock, K1>,
                                              BBlockTransferThreadClusterLengths_K0_N_K1,
                                              BBlockTransferThreadClusterArrangeOrder,
@@ -505,10 +505,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
        constexpr auto a_block_space_size_aligned =
            math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);
-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());
-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
            b_block_desc_k0_n_k1.GetElementSpaceSize());
@@ -582,7 +582,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
            constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
                GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl();
-            auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+            auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
                static_cast<FloatC*>(p_shared),
                c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                    .GetElementSpaceSize());
@@ -661,7 +661,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
                                                   7,
                                                   1,
-                                                   InMemoryDataOperationEnum_t::Set,
+                                                   InMemoryDataOperationEnum::Set,
                                                   1,
                                                   true>{
                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
@@ -74,7 +74,7 @@ template <
    typename FloatAB,
    typename FloatAcc,
    typename FloatC,
-    InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+    InMemoryDataOperationEnum CGlobalMemoryDataOperation,
    typename AGridDesc_K0_M_K1,
    typename BGridDesc_K0_N_K1,
    typename CGridDesc_M_N,
@@ -397,19 +397,19 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
        const CElementwiseOperation& c_element_op,
        const Block2CTileMap& block_2_ctile_map)
    {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid,
            c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                .GetElementSpaceSize());
-        auto c0_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c0_grid,
            c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                .GetElementSpaceSize());
-        auto c1_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c1_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c1_grid,
            c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                .GetElementSpaceSize());
@@ -441,7 +441,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              AElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                              Sequence<K0PerBlock, MPerBlock, K1>,
                                              ABlockTransferThreadClusterLengths_K0_M_K1,
                                              ABlockTransferThreadClusterArrangeOrder,
@@ -471,7 +471,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              BElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                              Sequence<K0PerBlock, NPerBlock, K1>,
                                              BBlockTransferThreadClusterLengths_K0_N_K1,
                                              BBlockTransferThreadClusterArrangeOrder,
@@ -522,10 +522,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
        constexpr auto a_block_space_size_aligned =
            math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);
-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());
-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
            b_block_desc_k0_n_k1.GetElementSpaceSize());
@@ -599,7 +599,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
            constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
                GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl();
-            auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+            auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
                static_cast<FloatC*>(p_shared),
                c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                    .GetElementSpaceSize());
@@ -678,7 +678,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
                                                   7,
                                                   1,
-                                                   InMemoryDataOperationEnum_t::Set,
+                                                   InMemoryDataOperationEnum::Set,
                                                   1,
                                                   true>{
                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
@@ -45,13 +45,13 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe
    const index_t thread_global_id = block_global_id * BlockSize + thread_local_id;
-    StaticBuffer<AddressSpaceEnum_t::Vgpr, DataType, 1, true> value_buf;
+    StaticBuffer<AddressSpaceEnum::Vgpr, DataType, 1, true> value_buf;
    value_buf(I0) = value;
    constexpr auto val_buff_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
-    auto global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+    auto global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
        p_global, grid_1d_buffer_desc.GetElementSpaceSize());
    if(thread_global_id < grid_1d_buffer_desc.GetElementSize())
@@ -65,7 +65,7 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe
                                                                   Sequence<0>,
                                                                   0,
                                                                   1,
-                                                                   InMemoryDataOperationEnum_t::Set,
+                                                                   InMemoryDataOperationEnum::Set,
                                                                   1,
                                                                   true>(
            grid_1d_buffer_desc, make_multi_index(thread_global_id), PassThroughOp{});

--- a/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
+#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
+#include "reduction_functions_accumulate.hpp"
+namespace ck {
+// ThreadwiseReduction
+//
+// Per-thread reduction of an (M, K) source buffer into an (M) destination buffer: for every
+// index m, each of the K source elements of row m is handed, together with dst[m], to
+// detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>::Calculate.
+//
+// Requirements:
+//  1) SrcThreadDesc_M_K is known at compile-time
+//  2) DstThreadDesc_M is known at compile-time
+//  3) SrcBuffer is a static buffer
+//  4) DstBuffer is a static buffer
+//  5) length 0 of the source descriptor equals length 0 of the destination descriptor
+//     (enforced by the static_assert below)
+template <typename AccDataType,
+          typename SrcThreadDesc_M_K,
+          typename DstThreadDesc_M,
+          typename OpReduce,
+          bool PropagateNan>
+struct ThreadwiseReduction
+{
+    // Default-constructed descriptor objects, used for compile-time length/offset queries.
+    static constexpr auto src_thread_desc_m_k = SrcThreadDesc_M_K{};
+    static constexpr auto dst_thread_desc_m   = DstThreadDesc_M{};
+    // Source lengths along dimension 0 (M) and dimension 1 (K); destination length along M.
+    static constexpr auto src_length_m = src_thread_desc_m_k.GetLength(Number<0>{});
+    static constexpr auto src_length_k = src_thread_desc_m_k.GetLength(Number<1>{});
+    static constexpr auto dst_length_m = dst_thread_desc_m.GetLength(Number<0>{});
+    static_assert(src_length_m == dst_length_m, "lengths of source and dst buffer must match!");
+    // Accumulation policy applied to every (destination, source) element pair.
+    using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>;
+    // Walks the source row by row and passes each element of a row, together with that row's
+    // destination slot, to Accumulation::Calculate. dst_buf is not written in any other way
+    // here (NOTE(review): callers presumably pre-fill it with the reduction's initial value
+    // -- confirm against the call sites).
+    template <typename SrcBufferType, typename DstBufferType>
+    __device__ static void Reduce(const SrcBufferType& src_buf, DstBufferType& dst_buf)
+    {
+        static_for<0, src_length_m, 1>{}([&](auto m_idx) {
+            // compile-time offset of the destination slot for row m_idx
+            constexpr index_t dst_offset = dst_thread_desc_m.CalculateOffset(make_tuple(m_idx));
+            static_for<0, src_length_k, 1>{}([&](auto k_idx) {
+                // compile-time offset of source element (m_idx, k_idx)
+                constexpr auto src_offset =
+                    src_thread_desc_m_k.CalculateOffset(make_tuple(m_idx, k_idx));
+                Accumulation::Calculate(dst_buf(Number<dst_offset>{}),
+                                        src_buf[Number<src_offset>{}]);
+            });
+        });
+    }
+};
+// ThreadwiseReductionWithIndex
+//
+// Same traversal as ThreadwiseReduction, but every value travels with an index: for each m,
+// the K (value, index) source pairs of row m are handed, together with the destination
+// (value, index) pair of row m, to
+// detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>::
+// Calculate.
+//
+// Requirements:
+//  1) SrcThreadDesc_M_K is known at compile-time
+//  2) DstThreadDesc_M is known at compile-time
+//  3) the source value/index buffers are static buffers
+//  4) the destination value/index buffers are static buffers
+//  5) length 0 of the source descriptor equals length 0 of the destination descriptor
+//     (enforced by the static_assert below)
+template <typename AccDataType,
+          typename IndexDataType,
+          typename SrcThreadDesc_M_K,
+          typename DstThreadDesc_M,
+          typename OpReduce,
+          bool PropagateNan>
+struct ThreadwiseReductionWithIndex
+{
+    // Default-constructed descriptor objects, used for compile-time length/offset queries.
+    static constexpr auto src_thread_desc_m_k = SrcThreadDesc_M_K{};
+    static constexpr auto dst_thread_desc_m   = DstThreadDesc_M{};
+    // Source lengths along dimension 0 (M) and dimension 1 (K); destination length along M.
+    static constexpr auto src_length_m = src_thread_desc_m_k.GetLength(Number<0>{});
+    static constexpr auto src_length_k = src_thread_desc_m_k.GetLength(Number<1>{});
+    static constexpr auto dst_length_m = dst_thread_desc_m.GetLength(Number<0>{});
+    static_assert(src_length_m == dst_length_m, "lengths of source and dst buffer must match!");
+    // Accumulation policy applied to every (destination, source) value/index pair.
+    using Accumulation =
+        detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>;
+    // Walks the source row by row. The value and index buffers share one descriptor, so the
+    // same compile-time offset addresses both members of a pair. The destination buffers are
+    // only passed to Accumulation::Calculate and not written in any other way here
+    // (NOTE(review): presumably the index follows whichever value the reduction keeps, e.g.
+    // for arg-min/arg-max -- confirm against AccumulateWithIndexAndNanCheck).
+    template <typename SrcValueBufferType,
+              typename SrcIndexBufferType,
+              typename DstValueBufferType,
+              typename DstIndexBufferType>
+    __device__ static void Reduce(const SrcValueBufferType& src_val_buf,
+                                  const SrcIndexBufferType& src_idx_buf,
+                                  DstValueBufferType& dst_val_buf,
+                                  DstIndexBufferType& dst_idx_buf)
+    {
+        static_for<0, src_length_m, 1>{}([&](auto m_idx) {
+            // compile-time offset of the destination (value, index) slot for row m_idx
+            constexpr index_t dst_offset = dst_thread_desc_m.CalculateOffset(make_tuple(m_idx));
+            static_for<0, src_length_k, 1>{}([&](auto k_idx) {
+                // compile-time offset of the source (value, index) pair at (m_idx, k_idx)
+                constexpr auto src_offset =
+                    src_thread_desc_m_k.CalculateOffset(make_tuple(m_idx, k_idx));
+                Accumulation::Calculate(dst_val_buf(Number<dst_offset>{}),
+                                        src_val_buf[Number<src_offset>{}],
+                                        dst_idx_buf(Number<dst_offset>{}),
+                                        src_idx_buf[Number<src_offset>{}]);
+            });
+        });
+    }
+};
+}; // end of namespace ck
+#endif
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -4,6 +4,7 @@
 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
+#include "tensor_space_filling_curve.hpp"
 namespace ck {
@@ -55,7 +56,7 @@ template <typename SrcData,
          typename DimAccessOrder,
          index_t DstVectorDim,
          index_t DstScalarPerVector,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          index_t DstScalarStrideInVector,
          bool DstResetCoordinateAfterRun,
          typename enable_if<SrcDesc::IsKnownAtCompileTime(), bool>::type = false>
@@ -78,6 +79,8 @@ struct ThreadwiseTensorSliceTransfer_v1r3
    {
        static_assert(SrcDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc need to known at compile-time");
+        static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
+                      "wrong! Not divisible");
    }
    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
@@ -85,16 +88,12 @@ struct ThreadwiseTensorSliceTransfer_v1r3
        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }
-    template <typename SrcSliceOriginIdx,
+    template <typename SrcSliceOriginIdx, typename SrcBuffer, typename DstBuffer>
-              typename SrcBuffer,
-              typename DstBuffer,
-              typename DstStepHacks>
    __device__ void Run(const SrcDesc&,
                        const SrcSliceOriginIdx&,
                        const SrcBuffer& src_buf,
                        const DstDesc& dst_desc,
-                        DstBuffer& dst_buf,
+                        DstBuffer& dst_buf)
-                        const DstStepHacks& dst_step_hacks)
    {
        static_assert(SrcDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc need to known at compile-time");
@@ -108,9 +107,6 @@ struct ThreadwiseTensorSliceTransfer_v1r3
        constexpr auto src_desc             = remove_cvref_t<SrcDesc>{};
        constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto dst_scalar_per_access = generate_sequence(
@@ -119,85 +115,26 @@ struct ThreadwiseTensorSliceTransfer_v1r3
        constexpr auto dst_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
-        constexpr auto dim_access_order = DimAccessOrder{};
+                                                    remove_cv_t<decltype(dst_scalar_per_access)>>;
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        // make forward steps
-        const auto dst_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
-            },
-            Number<nDim>{});
-        // make backward steps
-        const auto dst_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
-            },
-            Number<nDim>{});
-        // loop over tensor and copy
-        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-                forward_sweep_(I0) = true;
-                static_for<1, nDim, 1>{}([&](auto i) {
+        // TODO: Use SpaceFillingCurve::ScalarsPerAccess instead of DstScalarPerVector?
-                    index_t tmp = ordered_access_idx[I0];
+        static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector,
+                      "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
+        typename vector_type_maker<DstData, DstScalarPerVector>::type dst_vector;
+        using dst_vector_t = typename vector_type_maker<DstData, DstScalarPerVector>::type::type;
-                    static_for<1, i, 1>{}([&](auto j) {
+        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
-                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
-                    });
-                    forward_sweep_(i) = tmp % 2 == 0;
+        static_for<0, num_access, 1>{}([&](auto idx_1d) {
-                });
+            constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);
-                return forward_sweep_;
-            }();
-            // calculate dst data index
-            constexpr auto dst_data_idx = [&]() {
-                Index ordered_idx;
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    ordered_idx(i) = forward_sweep[i]
-                                         ? ordered_access_idx[i]
-                                         : ordered_access_lengths[i] - 1 - ordered_access_idx[i];
-                });
-                return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                       dst_scalar_per_access;
-            }();
-            typename vector_type_maker<DstData, DstScalarPerVector>::type dst_vector;
-            using dst_vector_t =
-                typename vector_type_maker<DstData, DstScalarPerVector>::type::type;
            // copy data from src_buf into dst_vector
+            // TODO: It's a hack here to use \p dst_scalar_step_in_vector. Use SpaceFillingCurve?
            static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
                constexpr index_t src_offset = src_desc.CalculateOffset(
-                    src_slice_origin_idx + dst_data_idx + i * dst_scalar_step_in_vector);
+                    src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
                SrcData dst_v;
@@ -212,69 +149,18 @@ struct ThreadwiseTensorSliceTransfer_v1r3
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
            // copy data from dst_vector into dst_buf
-            if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set)
+            dst_buf.template Update<DstInMemOp, dst_vector_t>(
-            {
+                dst_coord_.GetOffset(),
-                dst_buf.template Set<dst_vector_t>(
+                is_dst_valid,
-                    dst_coord_.GetOffset(),
+                dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
-                    is_dst_valid,
-                    dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
-            }
-            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd)
-            {
-                dst_buf.template AtomicAdd<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
-            }
-            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add)
-            {
-                typename vector_type_maker<DstData, DstScalarPerVector>::type tmp;
-                tmp.template AsType<dst_vector_t>()(Number<0>{}) =
-                    dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid);
-                static_for<0, DstScalarPerVector, 1>{}([&](auto t) {
-                    dst_vector.template AsType<DstData>()(t) += tmp.template AsType<DstData>()[t];
-                });
-                dst_buf.template Set<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
-            }
-            constexpr auto move_on_dim = [&]() constexpr
+            if constexpr(idx_1d.value != num_access - 1)
            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
+                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
-                    });
-                });
-                return move_on_dim_;
+                move_tensor_coordinate(
+                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
-            ();
-            // move
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
-                    }
-                }
-            });
        });
        // move dst coordinate back to slice origin (or not)
@@ -287,82 +173,27 @@ struct ThreadwiseTensorSliceTransfer_v1r3
        }
    }
-    template <typename SrcSliceOriginIdx, typename SrcBuffer, typename DstBuffer>
-    __device__ void Run(const SrcDesc&,
-                        const SrcSliceOriginIdx&,
-                        const SrcBuffer& src_buf,
-                        const DstDesc& dst_desc,
-                        DstBuffer& dst_buf)
-    {
-        constexpr index_t ntransform_dst = remove_cvref_t<DstDesc>::GetNumOfTransform();
-        constexpr auto zeros = typename uniform_sequence_gen<ntransform_dst, 0>::type{};
-        constexpr auto dst_step_hacks =
-            make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
-                       generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
-        Run(SrcDesc{}, SrcSliceOriginIdx{}, src_buf, dst_desc, dst_buf, dst_step_hacks);
-    }
    __device__ static constexpr auto GetDstCoordinateResetStep()
    {
-        constexpr auto I0 = Number<0>{};
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
-        constexpr auto dim_access_order = DimAccessOrder{};
+                                                    remove_cv_t<decltype(dst_scalar_per_access)>>;
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        // judge move forward or move backward during the last iteration
+        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
-        constexpr auto forward_sweep = [&]() {
+        if constexpr(num_access == 0)
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
+        {
+            return typename SpaceFillingCurve::Index{};
-            forward_sweep_(I0) = true;
+        }
+        else
-            static_for<1, nDim, 1>{}([&](auto i) {
+        {
-                index_t tmp = ordered_access_lengths[I0] - 1;
+            constexpr auto reset_step =
+                SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
-                });
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-            return forward_sweep_;
-        }();
-        // calculate dst data index after last iteration in Run(), if it has not being reset by
-        // RunWrite()
-        constexpr auto dst_data_idx = [&]() {
-            Index ordered_idx;
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
-            });
-            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                   dst_scalar_per_access;
-        }();
-        //
-        constexpr auto reset_dst_data_step = [&]() {
-            Index reset_dst_data_step_;
-            static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
-            return reset_dst_data_step_;
-        }();
-        return reset_dst_data_step;
+            return reset_step;
+        }
    }
    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
@@ -383,7 +214,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3
    private:
    DstCoord dst_coord_;
    const DstElementwiseOperation dst_element_op_;
-}; // namespace ck
+}; // struct ThreadwiseTensorSliceTransfer_v1r3
 // Assume:
 //   1. src:
@@ -421,6 +252,8 @@ struct ThreadwiseTensorSliceTransfer_v2
    {
        static_assert(DstDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc need to known at compile-time");
+        static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
+                      "wrong! Not divisible");
    }
    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -428,16 +261,12 @@ struct ThreadwiseTensorSliceTransfer_v2
        src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
    }
-    template <typename SrcBuffer,
+    template <typename SrcBuffer, typename DstBuffer, typename DstSliceOriginIdx>
-              typename DstBuffer,
-              typename DstSliceOriginIdx,
-              typename SrcStepHacks>
    __device__ void Run(const SrcDesc& src_desc,
                        const SrcBuffer& src_buf,
                        const DstDesc&,
                        const DstSliceOriginIdx&,
-                        DstBuffer& dst_buf,
+                        DstBuffer& dst_buf)
-                        const SrcStepHacks& src_step_hacks)
    {
        static_assert(DstDesc::IsKnownAtCompileTime(),
                      "wrong! DstDesc need to known at compile-time");
@@ -453,9 +282,6 @@ struct ThreadwiseTensorSliceTransfer_v2
        constexpr auto dst_desc             = remove_cvref_t<DstDesc>{};
        constexpr auto dst_slice_origin_idx = DstSliceOriginIdx{};
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto src_scalar_per_access = generate_sequence(
@@ -464,80 +290,19 @@ struct ThreadwiseTensorSliceTransfer_v2
        constexpr auto src_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
-        constexpr auto dim_access_order = DimAccessOrder{};
+                                                    remove_cv_t<decltype(src_scalar_per_access)>>;
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        // make forward steps
-        const auto src_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    src_desc, forward_step_idx, src_step_hacks[I0][i]);
-            },
-            Number<nDim>{});
-        // make backward steps
-        const auto src_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    src_desc, backward_step_idx, src_step_hacks[I1][i]);
-            },
-            Number<nDim>{});
        // loop over tensor and copy
-        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
+        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-                forward_sweep_(I0) = true;
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_access_idx[I0];
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
-                    });
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-                return forward_sweep_;
-            }();
-            // calculate src data index
-            constexpr auto src_data_idx = [&]() {
-                Index ordered_idx;
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    ordered_idx(i) = forward_sweep[i]
-                                         ? ordered_access_idx[i]
-                                         : ordered_access_lengths[i] - 1 - ordered_access_idx[i];
-                });
-                return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                       src_scalar_per_access;
-            }();
+        static_for<0, num_access, 1>{}([&](auto idx_1d) {
            typename vector_type_maker<SrcData, SrcScalarPerVector>::type src_vector;
            using src_vector_t =
                typename vector_type_maker<SrcData, SrcScalarPerVector>::type::type;
+            constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d);
            const bool is_src_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
@@ -552,41 +317,17 @@ struct ThreadwiseTensorSliceTransfer_v2
                    dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
                                             i * src_scalar_step_in_vector);
-                dst_buf(Number<dst_offset>{}) = src_vector.template AsType<SrcData>()[i];
+                dst_buf(Number<dst_offset>{}) =
+                    type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
            });
-            constexpr auto move_on_dim = [&]() constexpr
+            if constexpr(idx_1d.value != num_access - 1)
            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
+                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
-                    });
-                });
-                return move_on_dim_;
+                move_tensor_coordinate(
+                    src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
            }
-            ();
-            // move
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            src_desc, src_coord_, src_forward_steps[dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            src_desc, src_coord_, src_backward_steps[dim_access_order[i]]);
-                    }
-                }
-            });
        });
        // move src coordinate back to slice origin (or not)
@@ -599,82 +340,27 @@ struct ThreadwiseTensorSliceTransfer_v2
        }
    }
-    template <typename SrcBuffer, typename DstBuffer, typename DstSliceOriginIdx>
-    __device__ void Run(const SrcDesc& src_desc,
-                        const SrcBuffer& src_buf,
-                        const DstDesc&,
-                        const DstSliceOriginIdx&,
-                        DstBuffer& dst_buf)
-    {
-        constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform();
-        constexpr auto zeros = typename uniform_sequence_gen<ntransform_src, 0>::type{};
-        constexpr auto src_step_hacks =
-            make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
-                       generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
-        Run(src_desc, src_buf, DstDesc{}, DstSliceOriginIdx{}, dst_buf, src_step_hacks);
-    }
    __device__ static constexpr auto GetSrcCoordinateResetStep()
    {
-        constexpr auto I0 = Number<0>{};
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
-        constexpr auto dim_access_order = DimAccessOrder{};
+                                                    remove_cv_t<decltype(src_scalar_per_access)>>;
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-            forward_sweep_(I0) = true;
+        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
+        if constexpr(num_access == 0)
-            static_for<1, nDim, 1>{}([&](auto i) {
+        {
-                index_t tmp = ordered_access_lengths[I0] - 1;
+            return typename SpaceFillingCurve::Index{};
+        }
-                static_for<1, i, 1>{}([&](auto j) {
+        else
-                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
+        {
-                });
+            constexpr auto reset_step =
+                SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-            return forward_sweep_;
-        }();
-        // calculate src data index after last iteration in Run(), if it has not being reset by
-        // RunWrite()
-        constexpr auto src_data_idx = [&]() {
-            Index ordered_idx;
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
-            });
-            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                   src_scalar_per_access;
-        }();
-        //
-        constexpr auto reset_src_data_step = [&]() {
-            Index reset_src_data_step_;
-            static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; });
-            return reset_src_data_step_;
-        }();
-        return reset_src_data_step;
+            return reset_step;
+        }
    }
    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
@@ -721,7 +407,7 @@ struct ThreadwiseTensorSliceTransfer_v2
 //   3. src_slice_origin and dst_slice_origin are not known at compile-time,
 //   4. Use thread buffer
 template <typename SliceLengths,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          typename SrcData,
          typename DstData,
          typename SrcDesc,
@@ -758,6 +444,10 @@ struct ThreadwiseTensorSliceTransfer_v3
        : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
          dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin))
    {
+        static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
+                      "wrong! Not divisible");
+        static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
+                      "wrong! Not divisible");
    }
    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -774,8 +464,8 @@ struct ThreadwiseTensorSliceTransfer_v3
    __device__ void
    RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
    {
-        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
+        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
-                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
@@ -931,8 +621,8 @@ struct ThreadwiseTensorSliceTransfer_v3
    __device__ void
    RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks)
    {
-        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
+        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
-                          DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+                          DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
@@ -1289,7 +979,7 @@ struct ThreadwiseTensorSliceTransfer_v3
    static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
-    StaticBuffer<AddressSpaceEnum_t::Vgpr, SrcData, buffer_size_, true> buffer_;
+    StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;
    SrcCoord src_coord_;
    DstCoord dst_coord_;
@@ -1335,7 +1025,8 @@ struct ThreadwiseTensorSliceTransfer_v4
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc and DstDesc need to known at compile-time");
-        static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0, "wrong!");
+        static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
+                      "wrong! Not divisible");
    }
    template <typename SrcRefToOriginDisplacement,

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r4.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r4.hpp
-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R4_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R4_HPP
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-namespace ck {
-// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
-// and sometimes useless instructions:
-//   1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument
-//   instead
-//   2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same
-//   tensor coordinate instead
-//   3. Don't use a pointer to VGPR buffer, use vector instead
-// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
-// TODO: fix this
-// Assume:
-//   1. src:
-//     1. SrcDesc is known at compile-time
-//     2. SrcBuffer is StaticBuffer
-//     3. SrcSliceOrginIdx is known at compile-time
-//   2. dst:
-//     1. DstDesc is not known at compile-time
-//     2. DstBuffer is DynamicBuffer
-//     3. DstSliceOrginIdx is not known at compile time
-template <typename SrcData,
-          typename DstData,
-          typename SrcDesc,
-          typename DstDesc,
-          typename Dst0Desc, // this is really one of sources, but it has same shape as DstDesc
-          typename Dst1Desc, // this is really one of sources, but it has same shape as DstDesc
-          typename DstElementwiseOperation,
-          typename SliceLengths,
-          typename DimAccessOrder,
-          index_t DstVectorDim,
-          index_t DstScalarPerVector,
-          InMemoryDataOperationEnum_t DstInMemOp,
-          index_t DstScalarStrideInVector,
-          bool DstResetCoordinateAfterRun,
-          typename enable_if<SrcDesc::IsKnownAtCompileTime(), bool>::type = false>
-struct ThreadwiseTensorSliceTransfer_v1r4
-{
-    static constexpr index_t nDim = SliceLengths::Size();
-    using Index = MultiIndex<nDim>;
-    using DstCoord  = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
-    using Dst0Coord = decltype(make_tensor_coordinate(Dst0Desc{}, Index{}));
-    using Dst1Coord = decltype(make_tensor_coordinate(Dst1Desc{}, Index{}));
-    using DstCoordStep  = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
-    using Dst0CoordStep = decltype(make_tensor_coordinate_step(Dst0Desc{}, Index{}));
-    using Dst1CoordStep = decltype(make_tensor_coordinate_step(Dst1Desc{}, Index{}));
-    __device__ constexpr ThreadwiseTensorSliceTransfer_v1r4(
-        const DstDesc& dst_desc,
-        const Dst0Desc& dst0_desc,
-        const Dst1Desc& dst1_desc,
-        const Index& dst_slice_origin_idx,
-        const DstElementwiseOperation& dst_element_op)
-        : dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx)),
-          dst0_coord_(make_tensor_coordinate(dst0_desc, dst_slice_origin_idx)),
-          dst1_coord_(make_tensor_coordinate(dst1_desc, dst_slice_origin_idx)),
-          dst_element_op_{dst_element_op}
-    {
-        static_assert(SrcDesc::IsKnownAtCompileTime(),
-                      "wrong! SrcDesc need to known at compile-time");
-    }
-    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
-    {
-        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
-    }
-    template <typename SrcSliceOriginIdx,
-              typename SrcBuffer,
-              typename DstBuffer,
-              typename Dst0Buffer,
-              typename Dst1Buffer,
-              typename DstStepHacks,
-              typename Dst0StepHacks,
-              typename Dst1StepHacks>
-    __device__ void Run(const SrcDesc&,
-                        const SrcSliceOriginIdx&,
-                        const SrcBuffer& src_buf,
-                        const DstDesc& dst_desc,
-                        DstBuffer& dst_buf,
-                        const DstStepHacks& dst_step_hacks,
-                        const Dst0Desc& dst0_desc,
-                        const Dst0Buffer& dst0_buf,
-                        const Dst0StepHacks& dst0_step_hacks,
-                        const Dst1Desc& dst1_desc,
-                        const Dst1Buffer& dst1_buf,
-                        const Dst1StepHacks& dst1_step_hacks)
-    {
-        static_assert(SrcDesc::IsKnownAtCompileTime(),
-                      "wrong! SrcDesc need to known at compile-time");
-        static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value,
-                      "wrong! SrcSliceOrigin need to known at compile-time");
-        static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer");
-        // SrcDesc and src_slice_origin_idx are known at compile-time
-        constexpr auto src_desc             = remove_cvref_t<SrcDesc>{};
-        constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
-        constexpr auto dst_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
-        constexpr auto dim_access_order = DimAccessOrder{};
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        // make forward steps: dst
-        const auto dst_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
-            },
-            Number<nDim>{});
-        // make forward steps: dst0
-        // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
-        // DstScalarPerVector
-        // TODO: fix this
-        const auto dst0_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    dst0_desc, forward_step_idx, dst0_step_hacks[I0][i]);
-            },
-            Number<nDim>{});
-        // make forward steps: dst1
-        // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
-        // DstScalarPerVector
-        // TODO: fix this
-        const auto dst1_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    dst1_desc, forward_step_idx, dst1_step_hacks[I0][i]);
-            },
-            Number<nDim>{});
-        // make backward steps: dst
-        const auto dst_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
-            },
-            Number<nDim>{});
-        // make backward steps: dst0
-        // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
-        // DstScalarPerVector
-        // TODO: fix this
-        const auto dst0_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    dst0_desc, backward_step_idx, dst0_step_hacks[I1][i]);
-            },
-            Number<nDim>{});
-        // make backward steps: dst1
-        // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
-        // DstScalarPerVector
-        // TODO: fix this
-        const auto dst1_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    dst1_desc, backward_step_idx, dst1_step_hacks[I1][i]);
-            },
-            Number<nDim>{});
-        // loop over tensor and copy
-        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-                forward_sweep_(I0) = true;
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_access_idx[I0];
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
-                    });
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-                return forward_sweep_;
-            }();
-            // calculate dst data index
-            constexpr auto dst_data_idx = [&]() {
-                Index ordered_idx;
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    ordered_idx(i) = forward_sweep[i]
-                                         ? ordered_access_idx[i]
-                                         : ordered_access_lengths[i] - 1 - ordered_access_idx[i];
-                });
-                return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                       dst_scalar_per_access;
-            }();
-            typename vector_type_maker<DstData, DstScalarPerVector>::type dst_vector;
-            using dst_vector_t =
-                typename vector_type_maker<DstData, DstScalarPerVector>::type::type;
-            // load dst0 and dst1 and apply elementwise operation
-            {
-                // WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
-                // TODO: fix this
-                static_assert(DstScalarPerVector == 1, "wrong!");
-                // copy data from src_buf into dst_vector_src_data
-                constexpr index_t src_offset =
-                    src_desc.CalculateOffset(src_slice_origin_idx + dst_data_idx);
-                const SrcData src_v = src_buf[Number<src_offset>{}];
-                // load dst0 and dst1
-                const bool is_dst0_valid =
-                    coordinate_has_valid_offset_assuming_visible_index_is_valid(dst0_desc,
-                                                                                dst0_coord_);
-                const bool is_dst1_valid =
-                    coordinate_has_valid_offset_assuming_visible_index_is_valid(dst1_desc,
-                                                                                dst1_coord_);
-                const DstData dst0_v =
-                    dst0_buf.template Get<DstData>(dst0_coord_.GetOffset(), is_dst0_valid);
-                const DstData dst1_v =
-                    dst1_buf.template Get<DstData>(dst1_coord_.GetOffset(), is_dst1_valid);
-#if !CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R4_TYPE_CONVERT_ISSUE
-                // apply element-wise operation in SrcData type
-                const SrcData dst_v = dst_element_op_(
-                    src_v, type_convert<SrcData>(dst0_v), type_convert<SrcData>(dst1_v));
-                // apply type convert
-                dst_vector.template AsType<DstData>()(Number<0>{}) = type_convert<DstData>(dst_v);
-#else
-                // apply element-wise operation in DstData type
-                DstData dst_v;
-                dst_element_op_(dst_v, src_v, dst0_v, dst1_v);
-                dst_vector.template AsType<DstData>()(Number<0>{}) = dst_v;
-#endif
-            }
-            const bool is_dst_valid =
-                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
-            // copy data from dst_vector into dst_buf
-            if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set)
-            {
-                dst_buf.template Set<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
-            }
-            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd)
-            {
-                dst_buf.template AtomicAdd<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
-            }
-            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add)
-            {
-                typename vector_type_maker<DstData, DstScalarPerVector>::type tmp;
-                tmp.template AsType<dst_vector_t>()(Number<0>{}) =
-                    dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid);
-                static_for<0, DstScalarPerVector, 1>{}([&](auto t) {
-                    dst_vector.template AsType<DstData>()(t) += tmp.template AsType<DstData>()[t];
-                });
-                dst_buf.template Set<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
-            }
-            constexpr auto move_on_dim = [&]() constexpr
-            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
-                    });
-                });
-                return move_on_dim_;
-            }
-            ();
-            // move
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);
-                        // dst0
-                        move_tensor_coordinate(
-                            dst0_desc, dst0_coord_, dst0_forward_steps[dim_access_order[i]]);
-                        // dst1
-                        move_tensor_coordinate(
-                            dst1_desc, dst1_coord_, dst1_forward_steps[dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
-                        // dst0
-                        move_tensor_coordinate(
-                            dst0_desc, dst0_coord_, dst0_backward_steps[dim_access_order[i]]);
-                        // dst1
-                        move_tensor_coordinate(
-                            dst1_desc, dst1_coord_, dst1_backward_steps[dim_access_order[i]]);
-                    }
-                }
-            });
-        });
-        // move dst coordinate back to slice origin (or not)
-        if constexpr(DstResetCoordinateAfterRun)
-        {
-            const auto dst_reset_step =
-                make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
-            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
-        }
-    }
-    template <typename SrcSliceOriginIdx,
-              typename SrcBuffer,
-              typename DstBuffer,
-              typename Dst0Buffer,
-              typename Dst1Buffer>
-    __device__ void Run(const SrcDesc&,
-                        const SrcSliceOriginIdx&,
-                        const SrcBuffer& src_buf,
-                        const DstDesc& dst_desc,
-                        DstBuffer& dst_buf,
-                        const Dst0Desc& dst0_desc,
-                        const Dst0Buffer& dst0_buf,
-                        const Dst1Desc& dst1_desc,
-                        const Dst1Buffer& dst1_buf)
-    {
-        auto f_step_hacks = [&](auto desc) {
-            constexpr index_t ntransform = decltype(desc)::GetNumOfTransform();
-            constexpr auto zeros = typename uniform_sequence_gen<ntransform, 0>::type{};
-            constexpr auto step_hacks =
-                make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
-                           generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
-            return step_hacks;
-        };
-        Run(SrcDesc{},
-            SrcSliceOriginIdx{},
-            src_buf,
-            dst_desc,
-            dst_buf,
-            f_step_hacks(dst_desc),
-            dst0_desc,
-            dst0_buf,
-            f_step_hacks(dst0_desc),
-            dst1_desc,
-            dst1_buf,
-            f_step_hacks(dst1_desc));
-    }
-    __device__ static constexpr auto GetDstCoordinateResetStep()
-    {
-        constexpr auto I0 = Number<0>{};
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
-        constexpr auto dst_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
-        constexpr auto dim_access_order = DimAccessOrder{};
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-            forward_sweep_(I0) = true;
-            static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_access_lengths[I0] - 1;
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
-                });
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-            return forward_sweep_;
-        }();
-        // calculate dst data index after last iteration in Run(), if it has not being reset by
-        // RunWrite()
-        constexpr auto dst_data_idx = [&]() {
-            Index ordered_idx;
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
-            });
-            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                   dst_scalar_per_access;
-        }();
-        //
-        constexpr auto reset_dst_data_step = [&]() {
-            Index reset_dst_data_step_;
-            static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
-            return reset_dst_data_step_;
-        }();
-        return reset_dst_data_step;
-    }
-    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
-    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
-                                       const Index& dst_slice_origin_step_idx)
-    {
-        // if dst coord was not reset by Run(), then need to adjust the step here
-        const auto adjusted_step_idx =
-            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
-                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
-        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
-    }
-    private:
-    DstCoord dst_coord_;
-    Dst0Coord dst0_coord_;
-    Dst1Coord dst1_coord_;
-    const DstElementwiseOperation dst_element_op_;
-}; // namespace ck
-} // namespace ck
-#endif
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r5.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r5.hpp
-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R5_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R5_HPP
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-namespace ck {
-// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
-// and sometimes useless instructions:
-//   1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument
-//   instead
-//   2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same
-//   tensor coordinate instead
-//   3. Don't use a pointer to VGPR buffer, use vector instead
-// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
-// TODO: fix this
-// Assume:
-//   1. src:
-//     1. SrcDesc is known at compile-time
-//     2. SrcBuffer is StaticBuffer
-//     3. SrcSliceOrginIdx is known at compile-time
-//   2. dst:
-//     1. DstDesc is not known at compile-time
-//     2. DstBuffer is DynamicBuffer
-//     3. DstSliceOrginIdx is not known at compile time
-template <typename SrcData,
-          typename DstData,
-          typename SrcDesc,
-          typename DstDesc,
-          typename Dst0Desc, // this is really one of sources, but it has same shape as DstDesc
-          typename DstElementwiseOperation,
-          typename SliceLengths,
-          typename DimAccessOrder,
-          index_t DstVectorDim,
-          index_t DstScalarPerVector,
-          InMemoryDataOperationEnum_t DstInMemOp,
-          index_t DstScalarStrideInVector,
-          bool DstResetCoordinateAfterRun,
-          typename enable_if<SrcDesc::IsKnownAtCompileTime(), bool>::type = false>
-struct ThreadwiseTensorSliceTransfer_v1r5
-{
-    static constexpr index_t nDim = SliceLengths::Size();
-    using Index = MultiIndex<nDim>;
-    using DstCoord  = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
-    using Dst0Coord = decltype(make_tensor_coordinate(Dst0Desc{}, Index{}));
-    using DstCoordStep  = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
-    using Dst0CoordStep = decltype(make_tensor_coordinate_step(Dst0Desc{}, Index{}));
-    __device__ constexpr ThreadwiseTensorSliceTransfer_v1r5(
-        const DstDesc& dst_desc,
-        const Dst0Desc& dst0_desc,
-        const Index& dst_slice_origin_idx,
-        const DstElementwiseOperation& dst_element_op)
-        : dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx)),
-          dst0_coord_(make_tensor_coordinate(dst0_desc, dst_slice_origin_idx)),
-          dst_element_op_{dst_element_op}
-    {
-        static_assert(SrcDesc::IsKnownAtCompileTime(),
-                      "wrong! SrcDesc need to known at compile-time");
-    }
-    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
-    {
-        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
-    }
-    template <typename SrcSliceOriginIdx,
-              typename SrcBuffer,
-              typename DstBuffer,
-              typename Dst0Buffer,
-              typename DstStepHacks,
-              typename Dst0StepHacks>
-    __device__ void Run(const SrcDesc&,
-                        const SrcSliceOriginIdx&,
-                        const SrcBuffer& src_buf,
-                        const DstDesc& dst_desc,
-                        DstBuffer& dst_buf,
-                        const DstStepHacks& dst_step_hacks,
-                        const Dst0Desc& dst0_desc,
-                        const Dst0Buffer& dst0_buf,
-                        const Dst0StepHacks& dst0_step_hacks)
-    {
-        static_assert(SrcDesc::IsKnownAtCompileTime(),
-                      "wrong! SrcDesc need to known at compile-time");
-        static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value,
-                      "wrong! SrcSliceOrigin need to known at compile-time");
-        static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer");
-        // SrcDesc and src_slice_origin_idx are known at compile-time
-        constexpr auto src_desc             = remove_cvref_t<SrcDesc>{};
-        constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
-        constexpr auto dst_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
-        constexpr auto dim_access_order = DimAccessOrder{};
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        // make forward steps: dst
-        const auto dst_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
-            },
-            Number<nDim>{});
-        // make forward steps: dst0
-        // WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
-        // TODO: fix this
-        const auto dst0_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    dst0_desc, forward_step_idx, dst0_step_hacks[I0][i]);
-            },
-            Number<nDim>{});
-        // make backward steps: dst
-        const auto dst_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
-            },
-            Number<nDim>{});
-        // make backward steps: dst0
-        // WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
-        // TODO: fix this
-        const auto dst0_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
-                });
-                return make_tensor_coordinate_step(
-                    dst0_desc, backward_step_idx, dst0_step_hacks[I1][i]);
-            },
-            Number<nDim>{});
-        // loop over tensor and copy
-        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-                forward_sweep_(I0) = true;
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_access_idx[I0];
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
-                    });
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-                return forward_sweep_;
-            }();
-            // calculate dst data index
-            constexpr auto dst_data_idx = [&]() {
-                Index ordered_idx;
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    ordered_idx(i) = forward_sweep[i]
-                                         ? ordered_access_idx[i]
-                                         : ordered_access_lengths[i] - 1 - ordered_access_idx[i];
-                });
-                return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                       dst_scalar_per_access;
-            }();
-            typename vector_type_maker<DstData, DstScalarPerVector>::type dst_vector;
-            using dst_vector_t =
-                typename vector_type_maker<DstData, DstScalarPerVector>::type::type;
-            // load dst0 and apply elementwise operation
-            {
-                // WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
-                // TODO: fix this
-                static_assert(DstScalarPerVector == 1, "wrong!");
-                // copy data from src_buf into dst_vector_src_data
-                constexpr index_t src_offset =
-                    src_desc.CalculateOffset(src_slice_origin_idx + dst_data_idx);
-                const SrcData src_v = src_buf[Number<src_offset>{}];
-                // load dst0
-                const bool is_dst0_valid =
-                    coordinate_has_valid_offset_assuming_visible_index_is_valid(dst0_desc,
-                                                                                dst0_coord_);
-                const DstData dst0_v =
-                    dst0_buf.template Get<DstData>(dst0_coord_.GetOffset(), is_dst0_valid);
-#if !CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE
-                // apply element-wise operation in SrcData type
-                const SrcData dst_v = dst_element_op_(src_v, type_convert<SrcData>(dst0_v));
-                // apply type convert
-                dst_vector.template AsType<DstData>()(Number<0>{}) = type_convert<DstData>(dst_v);
-#else
-                // apply element-wise operation in DstData type
-                const DstData dst_v = dst_element_op_(src_v, dst0_v);
-                dst_vector.template AsType<DstData>()(Number<0>{}) = dst_v;
-#endif
-            }
-            const bool is_dst_valid =
-                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
-            // copy data from dst_vector into dst_buf
-            if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set)
-            {
-                dst_buf.template Set<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
-            }
-            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd)
-            {
-                dst_buf.template AtomicAdd<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
-            }
-            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add)
-            {
-                typename vector_type_maker<DstData, DstScalarPerVector>::type tmp;
-                tmp.template AsType<dst_vector_t>()(Number<0>{}) =
-                    dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid);
-                static_for<0, DstScalarPerVector, 1>{}([&](auto t) {
-                    dst_vector.template AsType<DstData>()(t) += tmp.template AsType<DstData>()[t];
-                });
-                dst_buf.template Set<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
-            }
-            constexpr auto move_on_dim = [&]() constexpr
-            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
-                    });
-                });
-                return move_on_dim_;
-            }
-            ();
-            // move
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);
-                        // dst0
-                        move_tensor_coordinate(
-                            dst0_desc, dst0_coord_, dst0_forward_steps[dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
-                        // dst0
-                        move_tensor_coordinate(
-                            dst0_desc, dst0_coord_, dst0_backward_steps[dim_access_order[i]]);
-                    }
-                }
-            });
-        });
-        // move dst coordinate back to slice origin (or not)
-        if constexpr(DstResetCoordinateAfterRun)
-        {
-            const auto dst_reset_step =
-                make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
-            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
-        }
-    }
-    template <typename SrcSliceOriginIdx,
-              typename SrcBuffer,
-              typename DstBuffer,
-              typename Dst0Buffer>
-    __device__ void Run(const SrcDesc&,
-                        const SrcSliceOriginIdx&,
-                        const SrcBuffer& src_buf,
-                        const DstDesc& dst_desc,
-                        DstBuffer& dst_buf,
-                        const Dst0Desc& dst0_desc,
-                        const Dst0Buffer& dst0_buf)
-    {
-        auto f_step_hacks = [&](auto desc) {
-            constexpr index_t ntransform = decltype(desc)::GetNumOfTransform();
-            constexpr auto zeros = typename uniform_sequence_gen<ntransform, 0>::type{};
-            constexpr auto step_hacks =
-                make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
-                           generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
-            return step_hacks;
-        };
-        Run(SrcDesc{},
-            SrcSliceOriginIdx{},
-            src_buf,
-            dst_desc,
-            dst_buf,
-            f_step_hacks(dst_desc),
-            dst0_desc,
-            dst0_buf,
-            f_step_hacks(dst0_desc));
-    }
-    __device__ static constexpr auto GetDstCoordinateResetStep()
-    {
-        constexpr auto I0 = Number<0>{};
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
-        constexpr auto dst_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
-        constexpr auto dim_access_order = DimAccessOrder{};
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-            forward_sweep_(I0) = true;
-            static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_access_lengths[I0] - 1;
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
-                });
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-            return forward_sweep_;
-        }();
-        // calculate dst data index after last iteration in Run(), if it has not being reset by
-        // RunWrite()
-        constexpr auto dst_data_idx = [&]() {
-            Index ordered_idx;
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
-            });
-            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                   dst_scalar_per_access;
-        }();
-        //
-        constexpr auto reset_dst_data_step = [&]() {
-            Index reset_dst_data_step_;
-            static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
-            return reset_dst_data_step_;
-        }();
-        return reset_dst_data_step;
-    }
-    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
-    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
-                                       const Index& dst_slice_origin_step_idx)
-    {
-        // if dst coord was not reset by Run(), then need to adjust the step here
-        const auto adjusted_step_idx =
-            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
-                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
-        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
-    }
-    private:
-    DstCoord dst_coord_;
-    Dst0Coord dst0_coord_;
-    const DstElementwiseOperation dst_element_op_;
-}; // namespace ck
-} // namespace ck
-#endif
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -48,7 +48,7 @@ struct lambda_scalar_per_access_for_src_and_dst
 template <typename SliceLengths,
          typename SrcElementwiseOperation,
          typename DstElementwiseOperation,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          typename SrcData,
          typename DstData,
          typename SrcDesc,
@@ -110,8 +110,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                            const SrcBuffer& src_buf,
                            Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
    {
-        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
+        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
-                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
@@ -271,7 +271,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
        static_ford<SliceLengths>{}([&](auto idx) {
            // convert from SrcData to DstData here
            dst_thread_scratch_(idx) =
-                type_convert<DstData>(src_thread_scratch_tuple[thread_scratch_id][idx]);
+                type_convert<DstData>(src_thread_scratch_tuple_[thread_scratch_id][idx]);
        });
 #else
        // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
@@ -361,8 +361,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
        // TODO move this elsewhere
        TransferDataFromSrcThreadScratchToDstThreadScratch(thread_scratch_id);
-        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
+        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
-                          DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+                          DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
@@ -763,13 +763,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
    static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){};
    static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){};
-    using SrcThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum_t::Vgpr,
+    using SrcThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
                                                             SrcData,
                                                             SrcScalarPerVector,
                                                             decltype(src_thread_scratch_desc_),
                                                             true>;
-    using DstThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum_t::Vgpr,
+    using DstThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
                                                             DstData,
                                                             DstScalarPerVector,
                                                             decltype(dst_thread_scratch_desc_),

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp
@@ -48,7 +48,7 @@ struct lambda_scalar_per_access_for_src_and_dst
 template <typename SliceLengths,
          typename SrcElementwiseOperation,
          typename DstElementwiseOperation,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          typename SrcData,
          typename DstData,
          typename SrcDesc,
@@ -120,8 +120,8 @@ struct ThreadwiseTensorSliceTransfer_v3r3
    template <typename SrcBuffer>
    __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
    {
-        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
+        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
-                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
@@ -369,8 +369,8 @@ struct ThreadwiseTensorSliceTransfer_v3r3
        // TODO move this elsewhere
        TransferDataFromSrcThreadScratchToDstThreadScratch();
-        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
+        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
-                          DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+                          DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
@@ -859,14 +859,14 @@ struct ThreadwiseTensorSliceTransfer_v3r3
    static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){};
    static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){};
-    StaticTensorTupleOfVectorBuffer<AddressSpaceEnum_t::Vgpr,
+    StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
                                    SrcData,
                                    SrcScalarPerVector,
                                    decltype(src_thread_scratch_desc_),
                                    true>
        src_thread_scratch_;
-    StaticTensorTupleOfVectorBuffer<AddressSpaceEnum_t::Vgpr,
+    StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
                                    DstData,
                                    DstScalarPerVector,
                                    decltype(dst_thread_scratch_desc_),

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
@@ -13,7 +13,7 @@ namespace ck {
 //   3. src_slice_origin and dst_slice_origin are not known at compile-time,
 //   4. Use thread buffer
 template <typename SliceLengths,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          typename SrcData,
          typename DstData,
          typename SrcDesc,
@@ -76,8 +76,8 @@ struct ThreadwiseTensorSliceTransfer_v5r1
    __device__ void
    RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
    {
-        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
+        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
-                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
@@ -244,8 +244,8 @@ struct ThreadwiseTensorSliceTransfer_v5r1
    __device__ void
    RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks)
    {
-        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
+        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
-                          DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
+                          DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");
        static_assert(
@@ -602,7 +602,7 @@ struct ThreadwiseTensorSliceTransfer_v5r1
    static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
-    StaticBuffer<AddressSpaceEnum_t::Vgpr, SrcData, buffer_size_, true> buffer_;
+    StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;
    SrcCoord src_coord_;
    DstCoord dst_coord_;

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp
@@ -4,6 +4,7 @@
 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
+#include "tensor_space_filling_curve.hpp"
 namespace ck {
@@ -28,7 +29,7 @@ template <typename SrcData,
          typename DimAccessOrder,
          index_t VectorDim,
          index_t ScalarPerVector,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          bool SrcResetCoordinateAfterRun,
          bool DstResetCoordinateAfterRun>
 struct ThreadwiseTensorSliceTransfer_v6r1
@@ -40,9 +41,6 @@ struct ThreadwiseTensorSliceTransfer_v6r1
    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
-    using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
-    using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
    static constexpr auto I0 = Number<0>{};
    __device__ constexpr ThreadwiseTensorSliceTransfer_v6r1(const SrcDesc& src_desc,
@@ -79,70 +77,14 @@ struct ThreadwiseTensorSliceTransfer_v6r1
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
-        constexpr auto dim_access_order = DimAccessOrder{};
+                                                    remove_cv_t<decltype(scalar_per_access)>>;
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        auto make_forward_steps = [&](auto desc) {
-            return generate_tuple(
-                [&](auto i) {
-                    Index forward_step_idx;
-                    static_for<0, nDim, 1>{}([&](auto j) {
-                        forward_step_idx(j) = (i.value == j.value) ? scalar_per_access[i] : 0;
-                    });
-                    return make_tensor_coordinate_step(desc, forward_step_idx);
-                },
-                Number<nDim>{});
-        };
-        auto make_backward_steps = [&](auto desc) {
-            return generate_tuple(
-                [&](auto i) {
-                    Index backward_step_idx;
-                    static_for<0, nDim, 1>{}([&](auto j) {
-                        backward_step_idx(j) = (i.value == j.value) ? -scalar_per_access[i] : 0;
-                    });
-                    return make_tensor_coordinate_step(desc, backward_step_idx);
-                },
-                Number<nDim>{});
-        };
-        // make forward steps
-        const auto src_forward_steps = make_forward_steps(src_desc);
-        const auto dst_forward_steps = make_forward_steps(dst_desc);
-        // make backward steps
-        const auto src_backward_steps = make_backward_steps(src_desc);
-        const auto dst_backward_steps = make_backward_steps(dst_desc);
-        // loop over slice window
-        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-                forward_sweep_(I0) = true;
+        // loop over space-filling curve
+        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_access_idx[I0];
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
-                    });
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-                return forward_sweep_;
-            }();
+        static_for<0, num_access, 1>{}([&](auto idx_1d) {
            using src_vector_type = vector_type_maker_t<SrcData, ScalarPerVector>;
            using src_vector_t    = typename src_vector_type::type;
@@ -168,59 +110,20 @@ struct ThreadwiseTensorSliceTransfer_v6r1
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
            // copy data from dst_vector into dst_buf
-            if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set)
+            dst_buf.template Update<DstInMemOp, dst_vector_t>(
-            {
+                dst_coord_.GetOffset(),
-                dst_buf.template Set<dst_vector_t>(
+                is_dst_valid,
-                    dst_coord_.GetOffset(),
+                dst_vector_container.template AsType<dst_vector_t>()[I0]);
-                    is_dst_valid,
-                    dst_vector_container.template AsType<dst_vector_t>()[I0]);
-            }
-            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd)
-            {
-                dst_buf.template AtomicAdd<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector_container.template AsType<dst_vector_t>()[I0]);
-            }
-            constexpr auto move_on_dim = [&]() constexpr
+            // move coordinate
+            if constexpr(idx_1d.value != num_access - 1)
            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
+                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
+                move_tensor_coordinate(
-                static_for<0, nDim, 1>{}([&](auto i) {
+                    src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
-                    move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
+                move_tensor_coordinate(
+                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
-                    });
-                });
-                return move_on_dim_;
            }
-            ();
-            // move coordinate
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            src_desc, src_coord_, src_forward_steps[dim_access_order[i]]);
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            src_desc, src_coord_, src_backward_steps[dim_access_order[i]]);
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
-                    }
-                }
-            });
        });
        // move coordinate back to slice origin (or not)
@@ -243,59 +146,25 @@ struct ThreadwiseTensorSliceTransfer_v6r1
    __device__ static constexpr auto GetCoordinateResetStep()
    {
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
-        constexpr auto dim_access_order = DimAccessOrder{};
+                                                    remove_cv_t<decltype(scalar_per_access)>>;
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-            forward_sweep_(I0) = true;
-            static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_access_lengths[I0] - 1;
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
-                });
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-            return forward_sweep_;
-        }();
-        // calculate data index after last iteration in Run(), if it has not being reset
-        constexpr auto data_idx = [&]() {
-            Index ordered_idx;
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
-            });
-            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                   scalar_per_access;
-        }();
-        //
+        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
-        constexpr auto reset_data_step = [&]() {
+        if constexpr(num_access == 0)
-            Index reset_data_step_;
+        {
+            return typename SpaceFillingCurve::Index{};
-            static_for<0, nDim, 1>{}([&](auto i) { reset_data_step_(i) = -data_idx[i]; });
+        }
+        else
-            return reset_data_step_;
+        {
-        }();
+            constexpr auto reset_step =
+                SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
-        return reset_data_step;
+            return reset_step;
+        }
    }
    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
@@ -332,7 +201,7 @@ struct ThreadwiseTensorSliceTransfer_v6r1
    SrcCoord src_coord_;
    DstCoord dst_coord_;
    const ElementwiseOperation element_op_;
-};
+}; // struct ThreadwiseTensorSliceTransfer_v6r1
 } // namespace ck
 #endif
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp
@@ -4,6 +4,7 @@
 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
+#include "tensor_space_filling_curve.hpp"
 namespace ck {
@@ -30,7 +31,7 @@ template <typename Src0Data,
          typename DimAccessOrder,
          index_t VectorDim,
          index_t ScalarPerVector,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          bool Src0ResetCoordinateAfterRun,
          bool Src1ResetCoordinateAfterRun,
          bool DstResetCoordinateAfterRun>
@@ -44,10 +45,6 @@ struct ThreadwiseTensorSliceTransfer_v6r2
    using Src1Coord = decltype(make_tensor_coordinate(Src1Desc{}, Index{}));
    using DstCoord  = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
-    using Src0CoordStep = decltype(make_tensor_coordinate_step(Src0Desc{}, Index{}));
-    using Src1CoordStep = decltype(make_tensor_coordinate_step(Src1Desc{}, Index{}));
-    using DstCoordStep  = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
    static constexpr auto I0 = Number<0>{};
    __device__ constexpr ThreadwiseTensorSliceTransfer_v6r2(const Src0Desc& src0_desc,
@@ -96,72 +93,14 @@ struct ThreadwiseTensorSliceTransfer_v6r2
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
-        constexpr auto dim_access_order = DimAccessOrder{};
+                                                    remove_cv_t<decltype(scalar_per_access)>>;
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        auto make_forward_steps = [&](auto desc) {
-            return generate_tuple(
-                [&](auto i) {
-                    Index forward_step_idx;
-                    static_for<0, nDim, 1>{}([&](auto j) {
-                        forward_step_idx(j) = (i.value == j.value) ? scalar_per_access[i] : 0;
-                    });
-                    return make_tensor_coordinate_step(desc, forward_step_idx);
-                },
-                Number<nDim>{});
-        };
-        auto make_backward_steps = [&](auto desc) {
-            return generate_tuple(
-                [&](auto i) {
-                    Index backward_step_idx;
-                    static_for<0, nDim, 1>{}([&](auto j) {
-                        backward_step_idx(j) = (i.value == j.value) ? -scalar_per_access[i] : 0;
-                    });
-                    return make_tensor_coordinate_step(desc, backward_step_idx);
-                },
-                Number<nDim>{});
-        };
-        // make forward steps
-        const auto src0_forward_steps = make_forward_steps(src0_desc);
-        const auto src1_forward_steps = make_forward_steps(src1_desc);
-        const auto dst_forward_steps  = make_forward_steps(dst_desc);
-        // make backward steps
-        const auto src0_backward_steps = make_backward_steps(src0_desc);
-        const auto src1_backward_steps = make_backward_steps(src1_desc);
-        const auto dst_backward_steps  = make_backward_steps(dst_desc);
-        // loop over slice window
-        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-                forward_sweep_(I0) = true;
+        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_access_idx[I0];
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
-                    });
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-                return forward_sweep_;
-            }();
+        // loop over space-filling curve
+        static_for<0, num_access, 1>{}([&](auto idx_1d) {
            using src0_vector_type = vector_type_maker_t<Src0Data, ScalarPerVector>;
            using src0_vector_t    = typename src0_vector_type::type;
@@ -197,65 +136,22 @@ struct ThreadwiseTensorSliceTransfer_v6r2
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
            // copy data from dst_vector into dst_buf
-            if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set)
+            dst_buf.template Update<DstInMemOp, dst_vector_t>(
-            {
+                dst_coord_.GetOffset(),
-                dst_buf.template Set<dst_vector_t>(
+                is_dst_valid,
-                    dst_coord_.GetOffset(),
+                dst_vector_container.template AsType<dst_vector_t>()[I0]);
-                    is_dst_valid,
-                    dst_vector_container.template AsType<dst_vector_t>()[I0]);
-            }
-            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd)
-            {
-                dst_buf.template AtomicAdd<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector_container.template AsType<dst_vector_t>()[I0]);
-            }
-            constexpr auto move_on_dim = [&]() constexpr
+            // move coordinate
+            if constexpr(idx_1d.value != num_access - 1)
            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
+                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
+                move_tensor_coordinate(
-                static_for<0, nDim, 1>{}([&](auto i) {
+                    src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step));
-                    move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
+                move_tensor_coordinate(
+                    src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step));
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
+                move_tensor_coordinate(
-                        move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
+                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
-                    });
-                });
-                return move_on_dim_;
            }
-            ();
-            // move coordinate
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            src0_desc, src0_coord_, src0_forward_steps[dim_access_order[i]]);
-                        move_tensor_coordinate(
-                            src1_desc, src1_coord_, src1_forward_steps[dim_access_order[i]]);
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            src0_desc, src0_coord_, src0_backward_steps[dim_access_order[i]]);
-                        move_tensor_coordinate(
-                            src1_desc, src1_coord_, src1_backward_steps[dim_access_order[i]]);
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
-                    }
-                }
-            });
        });
        // move coordinate back to slice origin (or not)
@@ -286,59 +182,25 @@ struct ThreadwiseTensorSliceTransfer_v6r2
    __device__ static constexpr auto GetCoordinateResetStep()
    {
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
-        constexpr auto dim_access_order = DimAccessOrder{};
+                                                    remove_cv_t<decltype(scalar_per_access)>>;
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-            forward_sweep_(I0) = true;
-            static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_access_lengths[I0] - 1;
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
-                });
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-            return forward_sweep_;
-        }();
-        // calculate data index after last iteration in Run(), if it has not being reset
-        constexpr auto data_idx = [&]() {
-            Index ordered_idx;
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
-            });
-            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                   scalar_per_access;
-        }();
-        //
+        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
-        constexpr auto reset_data_step = [&]() {
+        if constexpr(num_access == 0)
-            Index reset_data_step_;
+        {
+            return typename SpaceFillingCurve::Index{};
-            static_for<0, nDim, 1>{}([&](auto i) { reset_data_step_(i) = -data_idx[i]; });
+        }
+        else
-            return reset_data_step_;
+        {
-        }();
+            constexpr auto reset_step =
+                SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
-        return reset_data_step;
+            return reset_step;
+        }
    }
    // src_slice_origin_step_idx need to be known at compile-time, for performance reason

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp
@@ -4,6 +4,7 @@
 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
+#include "tensor_space_filling_curve.hpp"
 namespace ck {
@@ -32,7 +33,7 @@ template <typename Src0Data,
          typename DimAccessOrder,
          index_t VectorDim,
          index_t ScalarPerVector,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
          bool Src0ResetCoordinateAfterRun,
          bool Src1ResetCoordinateAfterRun,
          bool Src2ResetCoordinateAfterRun,
@@ -48,11 +49,6 @@ struct ThreadwiseTensorSliceTransfer_v6r3
    using Src2Coord = decltype(make_tensor_coordinate(Src2Desc{}, Index{}));
    using DstCoord  = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
-    using Src0CoordStep = decltype(make_tensor_coordinate_step(Src0Desc{}, Index{}));
-    using Src1CoordStep = decltype(make_tensor_coordinate_step(Src1Desc{}, Index{}));
-    using Src2CoordStep = decltype(make_tensor_coordinate_step(Src2Desc{}, Index{}));
-    using DstCoordStep  = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
    static constexpr auto I0 = Number<0>{};
    __device__ constexpr ThreadwiseTensorSliceTransfer_v6r3(const Src0Desc& src0_desc,
@@ -112,74 +108,14 @@ struct ThreadwiseTensorSliceTransfer_v6r3
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
-        constexpr auto dim_access_order = DimAccessOrder{};
+                                                    remove_cv_t<decltype(scalar_per_access)>>;
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        auto make_forward_steps = [&](auto desc) {
-            return generate_tuple(
-                [&](auto i) {
-                    Index forward_step_idx;
-                    static_for<0, nDim, 1>{}([&](auto j) {
-                        forward_step_idx(j) = (i.value == j.value) ? scalar_per_access[i] : 0;
-                    });
-                    return make_tensor_coordinate_step(desc, forward_step_idx);
-                },
-                Number<nDim>{});
-        };
-        auto make_backward_steps = [&](auto desc) {
-            return generate_tuple(
-                [&](auto i) {
-                    Index backward_step_idx;
-                    static_for<0, nDim, 1>{}([&](auto j) {
-                        backward_step_idx(j) = (i.value == j.value) ? -scalar_per_access[i] : 0;
-                    });
-                    return make_tensor_coordinate_step(desc, backward_step_idx);
-                },
-                Number<nDim>{});
-        };
-        // make forward steps
-        const auto src0_forward_steps = make_forward_steps(src0_desc);
-        const auto src1_forward_steps = make_forward_steps(src1_desc);
-        const auto src2_forward_steps = make_forward_steps(src2_desc);
-        const auto dst_forward_steps  = make_forward_steps(dst_desc);
-        // make backward steps
-        const auto src0_backward_steps = make_backward_steps(src0_desc);
-        const auto src1_backward_steps = make_backward_steps(src1_desc);
-        const auto src2_backward_steps = make_backward_steps(src2_desc);
-        const auto dst_backward_steps  = make_backward_steps(dst_desc);
-        // loop over slice window
-        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-                forward_sweep_(I0) = true;
+        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_access_idx[I0];
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
-                    });
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-                return forward_sweep_;
-            }();
+        // loop over space-filling curve
+        static_for<0, num_access, 1>{}([&](auto idx_1d) {
            using src0_vector_type = vector_type_maker_t<Src0Data, ScalarPerVector>;
            using src0_vector_t    = typename src0_vector_type::type;
@@ -224,72 +160,24 @@ struct ThreadwiseTensorSliceTransfer_v6r3
            const bool is_dst_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
-            // copy data from dst_vector into dst_buf
+            dst_buf.template Update<DstInMemOp, dst_vector_t>(
-            if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set)
+                dst_coord_.GetOffset(),
-            {
+                is_dst_valid,
-                dst_buf.template Set<dst_vector_t>(
+                dst_vector_container.template AsType<dst_vector_t>()[I0]);
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector_container.template AsType<dst_vector_t>()[I0]);
-            }
-            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd)
-            {
-                dst_buf.template AtomicAdd<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector_container.template AsType<dst_vector_t>()[I0]);
-            }
-            constexpr auto move_on_dim = [&]() constexpr
+            // move coordinate
+            if constexpr(idx_1d.value != num_access - 1)
            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
+                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
+                move_tensor_coordinate(
-                static_for<0, nDim, 1>{}([&](auto i) {
+                    src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step));
-                    move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
+                move_tensor_coordinate(
+                    src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step));
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
+                move_tensor_coordinate(
-                        move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
+                    src2_desc, src2_coord_, make_tensor_coordinate_step(src2_desc, forward_step));
-                    });
+                move_tensor_coordinate(
-                });
+                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
-                return move_on_dim_;
            }
-            ();
-            // move coordinate
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            src0_desc, src0_coord_, src0_forward_steps[dim_access_order[i]]);
-                        move_tensor_coordinate(
-                            src1_desc, src1_coord_, src1_forward_steps[dim_access_order[i]]);
-                        move_tensor_coordinate(
-                            src2_desc, src2_coord_, src2_forward_steps[dim_access_order[i]]);
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            src0_desc, src0_coord_, src0_backward_steps[dim_access_order[i]]);
-                        move_tensor_coordinate(
-                            src1_desc, src1_coord_, src1_backward_steps[dim_access_order[i]]);
-                        move_tensor_coordinate(
-                            src2_desc, src2_coord_, src2_backward_steps[dim_access_order[i]]);
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
-                    }
-                }
-            });
        });
        // move coordinate back to slice origin (or not)
@@ -328,59 +216,25 @@ struct ThreadwiseTensorSliceTransfer_v6r3
    __device__ static constexpr auto GetCoordinateResetStep()
    {
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
-        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
-        constexpr auto dim_access_order = DimAccessOrder{};
+                                                    remove_cv_t<decltype(scalar_per_access)>>;
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-            forward_sweep_(I0) = true;
-            static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_access_lengths[I0] - 1;
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
-                });
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-            return forward_sweep_;
-        }();
-        // calculate data index after last iteration in Run(), if it has not being reset
-        constexpr auto data_idx = [&]() {
-            Index ordered_idx;
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
-            });
-            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                   scalar_per_access;
-        }();
-        //
+        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
-        constexpr auto reset_data_step = [&]() {
+        if constexpr(num_access == 0)
-            Index reset_data_step_;
+        {
+            return typename SpaceFillingCurve::Index{};
-            static_for<0, nDim, 1>{}([&](auto i) { reset_data_step_(i) = -data_idx[i]; });
+        }
+        else
-            return reset_data_step_;
+        {
-        }();
+            constexpr auto reset_step =
+                SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
-        return reset_data_step;
+            return reset_step;
+        }
    }
    // src_slice_origin_step_idx need to be known at compile-time, for performance reason

--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -476,7 +476,7 @@ struct MfmaSelector
    template <>
    static constexpr auto GetMfma<bhalf_t, 32, 32>()
    {
-#if defined(CK_AMD_GPU_GFX90A)
+#if defined(CK_USE_AMD_MFMA_BF16_1K_OP)
        return MfmaInstr::mfma_f32_32x32x8bf16_1k;
 #else
        return MfmaInstr::mfma_f32_32x32x4bf16;
@@ -486,7 +486,7 @@ struct MfmaSelector
    template <>
    static constexpr auto GetMfma<bhalf_t, 16, 16>()
    {
-#if defined(CK_AMD_GPU_GFX90A)
+#if defined(CK_USE_AMD_MFMA_BF16_1K_OP)
        return MfmaInstr::mfma_f32_16x16x16bf16_1k;
 #else
        return MfmaInstr::mfma_f32_16x16x8bf16;

--- a/include/ck/utility/amd_address_space.hpp
+++ b/include/ck/utility/amd_address_space.hpp
@@ -9,7 +9,7 @@
 namespace ck {
-enum AddressSpaceEnum_t
+enum struct AddressSpaceEnum
 {
    Generic,
    Global,
@@ -19,7 +19,7 @@ enum AddressSpaceEnum_t
 };
 template <typename T>
-__device__ T* cast_pointer_to_generic_address_space(T CONSTANT* p)
+__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p)
 {
    // cast a pointer in "Constant" address space (4) to "Generic" address space (0)
     // only a C-style pointer cast seems to compile
@@ -30,13 +30,13 @@ __device__ T* cast_pointer_to_generic_address_space(T CONSTANT* p)
 }
 template <typename T>
-__host__ __device__ T CONSTANT* cast_pointer_to_constant_address_space(T* p)
+__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
 {
    // cast a pointer in "Generic" address space (0) to "Constant" address space (4)
     // only a C-style pointer cast seems to compile
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wold-style-cast"
-    return (T CONSTANT*)p; // NOLINT(old-style-cast)
+    return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
 #pragma clang diagnostic pop
 }

--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
-#ifndef CK_AMD_BUFFER_ADDRESSING_HPP
+#pragma once
-#define CK_AMD_BUFFER_ADDRESSING_HPP
 #include "data_type.hpp"
 namespace ck {
@@ -87,6 +85,7 @@ llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
 // buffer load fp16
 __device__ half_t
 llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc,
@@ -212,6 +211,7 @@ llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");
 // buffer store fp32
 __device__ void
 llvm_amdgcn_raw_buffer_store_fp32(float vdata,
@@ -233,6 +233,7 @@ llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
 // buffer atomic-add fp16
 __device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(
    half2_t vdata,
@@ -637,19 +638,19 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
        }
        else if constexpr(N == 2)
        {
-            llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data,
+            llvm_amdgcn_raw_buffer_store_i16x2(src_thread_data,
-                                                dst_wave_buffer_resource,
+                                               dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
+                                               dst_thread_addr_offset,
-                                                dst_wave_addr_offset,
+                                               dst_wave_addr_offset,
-                                                0);
+                                               0);
        }
        else if constexpr(N == 4)
        {
-            llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data,
+            llvm_amdgcn_raw_buffer_store_i16x4(src_thread_data,
-                                                dst_wave_buffer_resource,
+                                               dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
+                                               dst_thread_addr_offset,
-                                                dst_wave_addr_offset,
+                                               dst_wave_addr_offset,
-                                                0);
+                                               0);
        }
        else if constexpr(N == 8)
        {
@@ -1046,4 +1047,3 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
 }
 } // namespace ck
-#endif