Commit 77c81617 authored by Chao Liu

improving index calculation

parent f2f35201
@@ -34,7 +34,7 @@ template <index_t BlockSize,
          index_t GemmBBlockTransferSrcScalarPerVector_GemmN,
          index_t GemmBBlockTransferDstScalarPerVector_GemmN,
          index_t GemmCThreadTransferDstScalarPerVector_GemmN1>
struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
{
    template <typename... Wei, typename... In, typename... Out>
    __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
@@ -96,18 +96,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
        // input tensor
        const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(DynamicPassThrough{N},
                       DynamicPassThrough{C},
                       DynamicPad{Hi, InLeftPadH, InRightPadH},
                       DynamicPad{Wi, InLeftPadW, InRightPadW}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
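        // Note: a single DynamicPad per spatial dimension now carries both the left and the right
        // pad, so the upper length becomes Hi + InLeftPadH + InRightPadH (and likewise for W);
        // the previous version composed two transforms (DynamicLeftPad followed by
        // DynamicRightPad) to build the same padded descriptor.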
@@ -164,6 +157,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
        const index_t GemmM0 = GemmM / GemmM1;
        const index_t GemmN0 = GemmN / GemmN1;

#if 1 // debug
        const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
            transform_dynamic_tensor_descriptor(
                out_gemmm_gemmn_global_desc,
@@ -171,6 +165,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
                    DynamicUnMerge<2>{make_multi_index(GemmN0, GemmN1)}),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
#else
const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
transform_dynamic_tensor_descriptor(
out_gemmm_gemmn_global_desc,
make_tuple(
HackSemiDynamicUnMerge<3, Sequence<GemmM1>>{make_multi_index(1, GemmM0)},
HackSemiDynamicUnMerge<3, Sequence<GemmN1>>{make_multi_index(1, GemmN0)}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
#endif
        // GEMM
        using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v1<
...
@@ -93,6 +93,103 @@ struct DynamicPassThrough
    }
};
template <bool SkipIsValidCheck = false>
struct DynamicPad
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const UpperIndex up_lengths_;
const index_t left_pad_;
const index_t right_pad_;
#if 0
__host__ __device__ constexpr DynamicPad(const DynamicPad&) = default;
__host__ __device__ constexpr DynamicPad(DynamicPad&&) = default;
#else
__host__ __device__ constexpr DynamicPad(const DynamicPad& other)
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
{
}
__host__ __device__ constexpr DynamicPad(DynamicPad&& other)
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
{
}
#endif
__host__ __device__ constexpr DynamicPad(const index_t& low_length,
const index_t& left_pad,
const index_t& right_pad)
: up_lengths_{make_multi_index(low_length + left_pad + right_pad)},
left_pad_{left_pad},
right_pad_{right_pad}
{
}
__host__ __device__ constexpr DynamicPad() : up_lengths_{0}, left_pad_{0}, right_pad_{0} {}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{
static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_;
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static constexpr void
CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
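    // Since idx_low = idx_up - left_pad_, the constant left_pad_ cancels when differencing, so
    // the lower-index diff is just the upper-index diff; this is also why the transform reports
    // IsLinearTransform() == true.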
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return SkipIsValidCheck;
}
template <typename UpIdx>
__host__ __device__ constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const
{
return SkipIsValidCheck || ((idx_up[Number<0>{}] >= left_pad_) &&
(idx_up[Number<0>{}] < up_lengths_[Number<0>{}] - right_pad_));
}
};
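// Minimal usage sketch (illustrative numbers): DynamicPad{Hi, InLeftPadH, InRightPadH} exposes an
// upper length of Hi + InLeftPadH + InRightPadH. With Hi = 4, left_pad = 1, right_pad = 2 the
// upper index runs over [0, 7), idx_low = idx_up - 1, and only 1 <= idx_up < 5 maps to a valid
// lower index unless SkipIsValidCheck is set.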
template <bool SkipIsValidCheck = false>
struct DynamicLeftPad
{
@@ -546,15 +643,17 @@ struct DynamicMerge
    }

    // idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
    //
    // If idx_diff_up is known at compile-time, many calculations can be optimized
    // away by the compiler
    // This function assumes idx_low_old is not out of bounds
    // this version saves computation but uses more registers
    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack_1(LowIdxDiff& idx_diff_low,
                                                                      const UpIdxDiff& idx_diff_up,
                                                                      const LowIdx& idx_low_old,
                                                                      const UpIdx& /* idx_up_old */,
@@ -579,7 +678,10 @@ struct DynamicMerge
        // computed at
        // run-time each time this function is called, and can be very expensive.
        LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
        index_t tmp = idx_diff_up[Number<0>{}];

        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
@@ -587,10 +689,6 @@ struct DynamicMerge
            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
        });
        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;

        static_for<0, NDimLow, 1>{}([&](auto i) {
@@ -600,6 +698,13 @@ struct DynamicMerge
        });
#else
        // Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);

        static_for<0, NDimLow, 1>{}([&](auto i) {
@@ -633,6 +738,27 @@ struct DynamicMerge
            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
        }
        else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] - borrow;
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) -= borrow;
borrow = do_borrow ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
}
else
{ {
// do carry check on each low dimension in reversed order // do carry check on each low dimension in reversed order
// do not need to check the first dimension // do not need to check the first dimension
...@@ -641,26 +767,127 @@ struct DynamicMerge ...@@ -641,26 +767,127 @@ struct DynamicMerge
static_for<NDimLow - 1, 0, -1>{}([&](auto i) { static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry; index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i]; bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) = idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i]; do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry; idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry; carry = do_borrow ? -1 : carry;
}); });
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry; idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
} }
}
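    // Illustrative example for the Hack == 1 (carry) path above, assuming
    // idx_low_length_minus_idx_diff_low_const[i] = low_lengths_[i] - idx_diff_low_const[i]
    // as the name suggests: with low_lengths_ = {4, 8}, idx_low_old = {2, 6} and
    // idx_diff_up = 3 (so idx_diff_low_const = {0, 3}), dimension 1 sees 6 >= 8 - 3 and wraps,
    // giving idx_diff_low = {0 + 1, 3 - 8} = {1, -5} and a new lower index of {3, 1},
    // i.e. linear offset 22 + 3 = 25 = 3 * 8 + 1.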
    // idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
    // If idx_diff_up is known at compile-time, many calculations can be optimized
    // away by the compiler
    // This function assumes idx_low_old is not out of bounds
    // this version uses fewer registers but more computation
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack_2(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
        // However,
        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
        //      can be calculated at compile-time.
        //   2) If idx_diff_up is not known at compile-time, but its value
        //      doesn't change during the whole kernel execution, then
        //      idx_diff_low_const also doesn't change during the whole kernel
        //      execution. The compiler-generated ISA should only calculate
        //      idx_diff_low_const once and keep it for the whole kernel execution.
        // If neither 1) nor 2) is satisfied, then the calculation will be
        // computed at run-time each time this function is called, and can be
        // very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
#else
        // Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
do_carry = idx_low_tmp >= low_lengths_[i];
idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow;
do_borrow = idx_low_tmp < 0;
idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
}
        else
        {
#if 0
            // do carry check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t carry = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                idx_diff_low(i) = idx_diff_low_const[i] + carry;

                bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
@@ -677,9 +904,191 @@ struct DynamicMerge
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
#endif
        }
    }
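    // Register/recompute trade-off between the two variants: _hack_1 precomputes the wrap
    // thresholds (idx_low_length_minus/plus_idx_diff_low_const) once and compares idx_low_old
    // against them, while _hack_2 recomputes idx_low_old[i] + idx_diff_low_const[i] plus or minus
    // the carry on every dimension and compares it against low_lengths_[i] or 0, keeping fewer
    // values live.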
    // idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
    // If idx_diff_up is known at compile-time, many calculations can be optimized
    // away by the compiler
    // This function assumes idx_low_old is not out of bounds
    // this version uses fewer registers but more computation
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack_3(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
        // However,
        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
        //      can be calculated at compile-time.
        //   2) If idx_diff_up is not known at compile-time, but its value
        //      doesn't change during the whole kernel execution, then
        //      idx_diff_low_const also doesn't change during the whole kernel
        //      execution. The compiler-generated ISA should only calculate
        //      idx_diff_low_const once and keep it for the whole kernel execution.
        // If neither 1) nor 2) is satisfied, then the calculation will be
        // computed at run-time each time this function is called, and can be
        // very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
#else
        // Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
if constexpr(Hack == 1)
{
#if 1
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
do_carry = idx_low_tmp >= low_lengths_[i];
idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
#else
LowerIndex idx_low_new = idx_low_old;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
auto i_m1 = i - Number<1>{};
int64_t exec_mask;
int64_t do_carry;
idx_low_new(i) = idx_diff_low_const[i] + idx_low_old[i];
asm volatile(
"\n \
s_mov_b64 %0, exec \n \
v_cmpx_le_u32_e64 %1, %4, %2 \n \
v_subrev_u32 %2, %4, %2\n \
v_add_u32 %3, %3, 1\n \
s_mov_b64 exec, %0\n \
"
: "=s"(exec_mask), "=s"(do_carry), "=v"(idx_low_new(i)), "=v"(idx_low_new(i_m1))
: "s"(low_lengths_[i]), "2"(idx_low_new[i]), "3"(idx_low_new[i_m1]));
idx_diff_low(i) = idx_low_new[i] - idx_low_old[i];
});
constexpr auto I0 = Number<0>{};
idx_low_new(I0) += idx_diff_low_const[I0];
idx_diff_low(I0) = idx_low_new[I0] - idx_low_old[I0];
#endif
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow;
do_borrow = idx_low_tmp < 0;
idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
}
else
{
#if 0
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
#endif
}
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
#if 0
        // this version saves computation but uses more registers
CalculateLowerIndexDiff_hack_1(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#elif 1
        // this version uses fewer registers but more computation
CalculateLowerIndexDiff_hack_2(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#elif 1
        // this version uses fewer registers but more computation
CalculateLowerIndexDiff_hack_3(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#endif
}
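    // With the preprocessor arms above (#if 0 / #elif 1 / #elif 1), this dispatcher currently
    // forwards to CalculateLowerIndexDiff_hack_2; the second #elif 1 is unreachable, so
    // CalculateLowerIndexDiff_hack_3 is effectively disabled here.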
    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
@@ -695,7 +1104,7 @@ struct DynamicMerge
    }
};
template <index_t NDimUp, bool Use24BitIntegerCalculation = false>
struct DynamicUnMerge
{
    using LowerIndex = MultiIndex<1>;
@@ -726,12 +1135,26 @@ struct DynamicUnMerge
    template <typename LowIdx, typename UpIdx>
    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
                                                           const UpIdx& idx_up) const
{
if constexpr(!Use24BitIntegerCalculation)
        {
            idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];

            static_for<0, NDimUp - 1, 1>{}(
                [&](auto i) { idx_low(Number<0>{}) += idx_up[i] * up_lengths_scan_[i]; });
        }
else
{
idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
static_for<0, NDimUp - 1, 1>{}([&](auto i) {
idx_low(Number<0>{}) =
(0x00ffffff & idx_low[Number<0>{}]) +
(0x00ffffff & idx_up[i]) * (0x00ffffff & up_lengths_scan_[i]);
});
}
}
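    // Assumption: the 0x00ffffff masks are presumably there so the backend can use 24-bit
    // integer multiply/mad instructions (v_mul_u32_u24-style ops on AMD GCN); this is only
    // valid when every upper index and up_lengths_scan_ entry fits in 24 bits.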
    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
    __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
@@ -836,5 +1259,118 @@ struct DynamicFreeze
    }
};
#if 0
template <index_t NDimUp, typename StaticPartialUpLengths>
struct HackSemiDynamicUnMerge
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const UpperIndex up_lengths_;
const UpperIndex up_lengths_scan_;
static constexpr index_t NDimUpStatic = StaticPartialUpLengths::Size();
static constexpr index_t NDimUpDynamic = NDimUp - NDimUpStatic;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_scan_;
static constexpr auto static_partial_up_lengths_ = StaticPartialUpLengths{};
static constexpr auto static_partial_up_lengths_scan_ = reverse_exclusive_scan_sequence(
static_partial_up_lengths_, math::multiplies<index_t>(), Number<1>{});
__host__ __device__ constexpr HackSemiDynamicUnMerge(
const MultiIndex<NDimUpDynamic>& dynamic_partial_up_lengths)
: dynamic_partial_up_lengths_{dynamic_partial_up_lengths},
dynamic_partial_up_lengths_scan_{
container_reverse_exclusive_scan(dynamic_partial_up_lengths,
math::multiplies<index_t>(),
static_partial_up_lengths_scan_[Number<0>{}])}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ constexpr HackSemiDynamicUnMerge()
: up_lengths_{make_zero_multi_index<NDimUpDynamic>()},
up_lengths_scan_{make_zero_multi_index<NDimUpStatic>()}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr const auto GetUpperLengths() const
{
UpperIndex up_lengths;
static_for<0, NDimUpDynamic, 1>{}(
[&](auto i) { up_lengths(i) = dynamic_partial_up_lengths_[i]; });
static_for<0, NDimUpStatic, 1>{}([&](auto i) {
up_lengths(i + Number<NDimUpDynamic>{}) = static_partial_up_lengths_[i];
});
return up_lengths;
}
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{
idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
static_for<0, NDimUpDynamic, 1>{}([&](auto i) {
idx_low(Number<0>{}) += idx_up[i] * dynamic_partial_up_lengths_scan_[i];
});
static_for<NDimUpDynamic, NDimUp - 1, 1>{}([&](auto i) {
idx_low(Number<0>{}) +=
idx_up[i] * static_partial_up_lengths_scan_[i + Number<NDimUpDynamic>{}];
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
{
CalculateLowerIndex(idx_diff_low, idx_diff_up);
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
template <typename UpIdx>
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
};
#endif
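// Intent of the disabled block above (compiled out by #if 0): HackSemiDynamicUnMerge splits the
// un-merge lengths into a runtime part and a compile-time StaticPartialUpLengths tail so part of
// the reverse scan folds into constants; the disabled #else branch of the driver uses it as
// HackSemiDynamicUnMerge<3, Sequence<GemmM1>>{make_multi_index(1, GemmM0)}.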
} // namespace ck
#endif
@@ -96,162 +96,39 @@ struct BlockwiseDynamicTensorSliceTransfer_v4
        }
    }

    __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_transfer_.RunRead_hack(src_desc, p_src);
        }
    }

    __device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_transfer_.RunWrite(dst_desc, p_dst);
        }
    }

    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
        }
    }

    __device__ void MoveSrcSliceWindow_hack(const SrcDesc& src_desc, const Index& step)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_transfer_.MoveSrcSliceWindow_hack(src_desc, step);
        }
    }
@@ -268,7 +145,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v4_hack
        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

    using ThreadwiseTransfer =
        ThreadwiseDynamicTensorSliceTransfer_v3<ThreadSliceLengths,
                                                DstInMemOp,
                                                SrcData,
                                                DstData,
...
@@ -166,7 +166,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
        // B matrix blockwise copy
        auto b_blockwise_copy =
            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
                                                   InMemoryDataOperation::Set,
                                                   Sequence<KPerBlock, NPerBlock>,
                                                   BBlockTransferThreadSliceLengths_K_N,
@@ -258,16 +258,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0);
        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0);
        // LDS double buffer: preload data into LDS
        {
            a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
            b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);

            a_blockwise_copy.RunWrite(a_k_m_block_desc, p_a_block_double);
            b_blockwise_copy.RunWrite(b_k_n_block_desc, p_b_block_double);
        }
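        // Note: only the B-matrix copy goes through the _hack read/move variants here, presumably
        // because B reads through the transformed (padded) input descriptor, while the A-matrix
        // copy keeps the plain RunRead/MoveSrcSliceWindow.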
        if constexpr(HasMainKBlockLoop)
        {
@@ -285,13 +283,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
            {
                // even iteration
                a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
                b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc,
                                                         b_block_slice_copy_step);

                __syncthreads();

                // LDS double buffer: load next data from device mem
                a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
                b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);

                // LDS double buffer: GEMM on current data
                blockwise_gemm.Run(p_a_block_even, p_b_block_even, p_c_thread);
@@ -302,13 +301,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                // odd iteration
                a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
                b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc,
                                                         b_block_slice_copy_step);

                __syncthreads();

                // LDS double buffer: load next data from device mem
                a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
                b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);

                // LDS double buffer: GEMM on current data
                blockwise_gemm.Run(p_a_block_odd, p_b_block_odd, p_c_thread);
@@ -326,13 +326,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
            if constexpr(HasDoubleTailKBlockLoop) // if there are 2 iterations left
            {
                a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
                b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc, b_block_slice_copy_step);

                __syncthreads();

                // LDS double buffer: load last data from device mem
                a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
                b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);

                // LDS double buffer: GEMM on 2nd-last data
                blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread);
@@ -384,8 +384,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
            Float,
            decltype(c_m0_m1_n0_n1_thread_desc),
            decltype(c_m0_m1_n0_n1_global_desc),
#if 1 // debug
            Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
            CThreadTransferSrcDstAccessOrder,
#else
            Sequence<1, 1, 2, 4>,
            Sequence<0, 1, 2, 3>,
#endif
            CThreadTransferSrcDstVectorDim,
            1,
            CThreadTransferDstScalarPerVector,
@@ -402,7 +407,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                m_thread_data_on_global % M1,
                n_thread_data_on_global / N1,
                n_thread_data_on_global % N1))
            .Run_hack(
                c_m0_m1_n0_n1_thread_desc, p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
    }
}
...
@@ -25,7 +25,7 @@ template <typename SrcData,
          index_t SrcScalarStrideInVector,
          index_t DstScalarStrideInVector,
          bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
                                           // Run(), will be fused with MoveSrcSliceWindow to
                                           // save addr computation
          bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
                                           // RunWrite(), will be fused with MoveDstSliceWindow to
@@ -333,7 +333,307 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
            back_step(i) = (SliceLengths{}[i - Number<1>{}] % 2 == 0) ? 0 : (1 - SliceLengths{}[i]);
        });

        return back_step;
}
__device__ void
Run_hack(const SrcDesc& src_desc, const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
{
if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
{
            // TODO use constexpr for coordinate-step to make sure the compiler behaves correctly
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
const auto src_step_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
const auto src_step_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
#pragma unroll
for(index_t iter0 = 0; iter0 < Len0; ++iter0)
{
#pragma unroll
for(index_t iter1 = 0; iter1 < Len1; ++iter1)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize(),
p_dst,
dst_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_),
dst_desc.GetElementSpaceSize());
// move dim1 iterator
if(iter1 < Len1 - 1)
{
bool forward_dim1 = (iter0 % 2 == 0);
if(forward_dim1)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_p1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_m1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1);
}
}
}
// move dim0 iterator
if(iter0 < Len0 - 1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
}
}
}
else if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4)
{
            // TODO use constexpr for coordinate-step to make sure the compiler behaves correctly
#if 0
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
const auto src_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
const auto src_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
const auto src_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto src_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto src_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto src_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
const auto dst_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, 1));
const auto dst_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, -1));
const auto dst_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 1, 0));
const auto dst_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, -1, 0));
const auto dst_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto dst_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto dst_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#else
// hack for output tensor
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
const auto src_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
const auto src_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
            const auto src_step_0_p1_0_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1, 0, 0));
            const auto src_step_0_m1_0_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1, 0, 0));

            const auto src_step_p1_0_0_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0, 0, 0));
            const auto src_step_m1_0_0_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0, 0, 0));
const auto dst_step_0_0_0_p1 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 0, 1), Sequence<0, 0, 1, 0, 0>{});
const auto dst_step_0_0_0_m1 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 0, -1), Sequence<0, 0, 2, 0, 0>{});
const auto dst_step_0_0_p1_0 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 1, 0), Sequence<0, 0, 1, 0, 0>{});
const auto dst_step_0_0_m1_0 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, -1, 0), Sequence<0, 0, 2, 0, 0>{});
const auto dst_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto dst_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto dst_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#endif
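            // Assumption about the extra Sequence argument: the entries appear to be per-transform
            // hints for the output descriptor, using 1 where the step only moves forward, 2 where
            // it only moves backward, and 0 elsewhere, mirroring the padded-input hints used by
            // RunRead_hack in this file.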
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
constexpr index_t Len2 = SliceLengths{}[2];
constexpr index_t Len3 = SliceLengths{}[3];
#pragma unroll
for(index_t iter0 = 0; iter0 < Len0; ++iter0)
{
#pragma unroll
for(index_t iter1 = 0; iter1 < Len1; ++iter1)
{
#pragma unroll
for(index_t iter2 = 0; iter2 < Len2; ++iter2)
{
#pragma unroll
for(index_t iter3 = 0; iter3 < Len3; ++iter3)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize(),
p_dst,
dst_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_),
dst_desc.GetElementSpaceSize());
                        // move dim3 iterator
if(iter3 < Len3 - 1)
{
bool forward_dim3 = (iter2 % 2 == 0);
if(forward_dim3)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_0_p1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_0_m1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_0_m1);
}
}
}
                    // move dim2 iterator
if(iter2 < Len2 - 1)
{
bool forward_dim2 = (iter1 % 2 == 0);
if(forward_dim2)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_p1_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_p1_0);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_m1_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_m1_0);
}
}
}
// move dim1 iterator
if(iter1 < Len1 - 1)
{
bool forward_dim1 = (iter0 % 2 == 0);
if(forward_dim1)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_p1_0_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1_0_0);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_m1_0_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1_0_0);
}
}
}
// move dim0 iterator:
if(iter0 < Len0 - 1)
{
// move forward in dim0
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0_0_0);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0_0_0);
}
}
}
// move src and dst coordinate back to their origins
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_back_step =
make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateBackStep());
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
}
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_back_step =
make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
}
    }
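    // Traversal note: Run_hack walks the slice in a back-and-forth ("snake") order: each inner
    // dimension flips direction with the parity of the next outer iteration, so every move
    // between visited elements is a single +/-1 coordinate step. For a 2x3 slice the visit order
    // is (0,0) (0,1) (0,2) (1,2) (1,1) (1,0).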
    // src_slice_origin_step_idx needs to be known at compile-time, for performance reason
@@ -591,132 +891,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        }
    }

    __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
    {
static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2, static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
"wrong! hardcoded for 2D tensor"); "wrong! hardcoded for 2D tensor");
...@@ -736,44 +911,18 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack ...@@ -736,44 +911,18 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0)); make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 = const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0)); make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
#elif 0
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, 1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, -1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(-1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
#elif 1
// for padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, 1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, -1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(-1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 1
#elif 1
// for padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 0
// for non-padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
...@@ -806,12 +955,23 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
static_assert(SrcAddressSpace == AddressSpace::Global,
"wrong! hardcoded to use buffer_load, src must be global mem");
#if 0 // debug
buffer_(Number<buffer_offset>{}) = amd_buffer_load<SrcData, 1>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize());
#else
SrcData tmp = amd_buffer_load<SrcData, 1>(
p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
buffer_(Number<buffer_offset>{}) = is_valid ? tmp : SrcData{0};
#endif
// move dim1 iterator
if constexpr(iter1.value < Len1 - 1)
...@@ -847,83 +1007,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
}
}
__device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
{
static_assert(remove_reference_t<DstDesc>::GetNumOfDimension() == 2,
"wrong! hardcoded for 2D tensor");
// hardcoded for 2D
// TODO implement N-D
if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
{
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
static_for<0, Len0, 1>{}([&](auto iter0) {
static_for<0, Len1, 1>{}([&](auto iter1) {
// step direction
constexpr bool forward_dim1 = (iter0.value % 2 == 0);
constexpr index_t i0 = iter0;
constexpr index_t i1 = forward_dim1 ? iter1.value : Len1 - iter1.value - 1;
// do work
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(make_multi_index(i0, i1));
// hardcoding for ds_write
// TODO refactor transfer_data() to encapsulate this
static_assert(DstAddressSpace == AddressSpace::Lds &&
DstInMemOp == InMemoryDataOperation::Set,
"wrong! hardcoded for ds_write");
p_dst[dst_slice_origin_.GetOffset()] = buffer_[Number<buffer_offset>{}];
// move dim1 iterator
if constexpr(iter1.value < Len1 - 1)
{
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1);
}
}
});
// move dim0 iterator
if constexpr(iter0.value < Len0 - 1)
{
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
}
});
}
// move dst coordinate back to its slice origin
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_back_step =
make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
}
}
__device__ static constexpr auto GetCoordinateBackStep()
{
MultiIndex<nDim> back_step;
...@@ -947,18 +1030,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
: src_slice_origin_step_idx + GetCoordinateBackStep();
// is it OK to construct a new step every time?
#if 0 // hack
const auto adjusted_step = make_dynamic_tensor_coordinate_step(
src_desc, adjusted_step_idx);
#elif 1
// for padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
#elif 1
// for non-padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
#endif
const auto adjusted_step = make_dynamic_tensor_coordinate_step(src_desc, adjusted_step_idx);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
}
...@@ -978,6 +1050,32 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, adjusted_step);
}
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
__device__ void MoveSrcSliceWindow_hack(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// if the src coordinate was not reset by RunRead(), the step needs to be adjusted here
const auto adjusted_step_idx = SrcResetCoordinateAfterRun
? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetCoordinateBackStep();
// is it OK to construct a new step every time?
#if 0 // hack
const auto adjusted_step = make_dynamic_tensor_coordinate_step(
src_desc, adjusted_step_idx);
#elif 1
// for padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
#elif 0
// for non-padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
#endif
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
}
private:
static constexpr auto buffer_desc_ =
make_dynamic_naive_tensor_descriptor_packed<nDim>(to_multi_index(SliceLengths{}));
...@@ -989,5 +1087,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
SrcCoord src_slice_origin_;
DstCoord dst_slice_origin_;
};
} // namespace ck
#endif
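Editor's note: the RunRead/RunWrite loops in this transfer walk the 2D slice in a snake order — dim1 runs forward on even iterations of dim0 and backward on odd ones — so moving between consecutive elements never needs more than a single +1/-1 coordinate step, and the thread-buffer offset is computed directly from the packed buffer descriptor (points 2–4 of the comment at the top of the struct). A minimal host-side sketch of that visiting order, with illustrative names that are not the library's API:

```cpp
#include <cstdio>

// Visit a Len0 x Len1 slice in snake order: dim1 runs forward on even rows and
// backward on odd rows, so consecutive visits differ by exactly one +/-1 step.
template <int Len0, int Len1, typename F>
void snake_visit(F visit)
{
    for(int i0 = 0; i0 < Len0; ++i0)
    {
        const bool forward_dim1 = (i0 % 2 == 0);

        for(int iter1 = 0; iter1 < Len1; ++iter1)
        {
            const int i1 = forward_dim1 ? iter1 : Len1 - iter1 - 1;
            visit(i0, i1);
        }
    }
}

int main()
{
    // Offset into a packed 4x3 thread buffer, computed directly as i0 * 3 + i1.
    snake_visit<4, 3>(
        [](int i0, int i1) { std::printf("(%d, %d) -> offset %d\n", i0, i1, i0 * 3 + i1); });
    return 0;
}
```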
...@@ -152,10 +152,15 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
#if 1 // debug
float tmp = __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? tmp : float(0);
#else
return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
#endif
#endif
}
......
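Editor's note: both here and in the RunRead change above, out-of-range handling moves from the buffer descriptor's address clamping to an explicit select — the load is issued unconditionally and the result is replaced with zero whenever the coordinate is invalid (for example, an element inside the padding region). A schematic host-side analogue of that masked-load pattern (plain C++; it assumes the offset itself is always addressable, which is what the device code relies on):

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Schematic analogue of the masked load: always read at a legal offset,
// then select zero when the logical coordinate is invalid (e.g. padding).
float masked_load(const std::vector<float>& src, int offset, bool is_valid)
{
    assert(offset >= 0 && offset < static_cast<int>(src.size())); // element space covers the offset

    const float tmp = src[static_cast<std::size_t>(offset)]; // unconditional load
    return is_valid ? tmp : 0.0f;                            // zero-fill invalid elements
}

int main()
{
    const std::vector<float> src{1.0f, 2.0f, 3.0f};
    std::printf("%f %f\n", masked_load(src, 1, true), masked_load(src, 1, false)); // 2.0 0.0
    return 0;
}
```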
...@@ -87,7 +87,7 @@
// thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 1
#endif
// workaround: put all workaround here
......
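Editor's note: flipping this switch to 1 lets the merge-transform index-difference calculation mark quantities that are the same for every lane with readfirstlane, nudging the compiler to keep them in scalar registers instead of recomputing them per lane. A hedged sketch of the underlying idea — the wrapper below is purely illustrative; only the __builtin_amdgcn_readfirstlane builtin on AMDGCN targets is assumed:

```cpp
// Illustrative helper: wrap a value that is known to be wavefront-uniform so the
// compiler can keep it in a scalar register (SGPR). Using it on a divergent value
// is a bug -- that is what "thread-invariant, otherwise it's a bug" warns about.
inline int make_wave_uniform(int x)
{
#if defined(__AMDGCN__)
    return __builtin_amdgcn_readfirstlane(x); // broadcast lane 0's value to all lanes
#else
    return x; // non-AMDGCN fallback: no-op
#endif
}
```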
...@@ -750,6 +750,13 @@ __host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce,
return typename sequence_reverse_inclusive_scan<Seq, Reduce, Init>::type{};
}
template <typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr auto reverse_exclusive_scan_sequence(Seq, Reduce, Number<Init>)
{
return reverse_inclusive_scan_sequence(Seq::PopFront(), Reduce{}, Number<Init>{})
.PushBack(Number<Init>{});
}
template <typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce, Number<Init>)
{
......
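Editor's note: the new reverse_exclusive_scan_sequence computes, at compile time, a reverse exclusive scan of a Sequence. With multiplication as the reduction and Init = 1, this kind of scan gives the strides of a packed tensor: for lengths {L0, L1, L2} it yields {L1*L2, L2, 1}. A small standalone sketch of the same arithmetic at runtime (plain C++17, independent of the Sequence machinery):

```cpp
#include <array>
#include <cstdio>

// Reverse exclusive scan under multiplication with init = 1:
// for lengths {L0, L1, L2} it produces {L1*L2, L2, 1}, i.e. the strides
// of a packed (contiguous, row-major) tensor.
template <std::size_t N>
constexpr std::array<long, N> reverse_exclusive_scan_mul(const std::array<long, N>& len)
{
    std::array<long, N> out{};
    long running = 1;
    for(std::size_t i = N; i-- > 0;)
    {
        out[i] = running; // exclusive: the current element is not yet included
        running *= len[i];
    }
    return out;
}

int main()
{
    constexpr auto strides = reverse_exclusive_scan_mul<3>({2, 3, 4});
    static_assert(strides[0] == 12 && strides[1] == 4 && strides[2] == 1, "packed strides");
    std::printf("%ld %ld %ld\n", strides[0], strides[1], strides[2]);
    return 0;
}
```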
...@@ -155,6 +155,7 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
// b threadwise copy 4x1
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
...@@ -185,6 +186,40 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
// b threadwise copy 2x2
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 8;
constexpr index_t GemmNLevel1Cluster = 8;
constexpr index_t ThreadGemmDataPerReadM = 4;
constexpr index_t ThreadGemmDataPerReadN = 4;
using GemmABlockCopyThreadSliceLengths_GemmK_GemmM = Sequence<4, 1>;
using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
constexpr index_t GemmABlockCopySrcDataPerRead_GemmK = 4;
constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 1;
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<2, 2>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<4, 64>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
......
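Editor's note: the added "b threadwise copy 2x2" configuration keeps the 128x128x8 block tile and the 256-thread block; what changes is how the GemmB block copy is divided among threads — each thread now moves a 2x2 slice and the threads form a 4x64 cluster, so slice times cluster still covers GemmKPerBlock x GemmNPerBlock and the cluster still uses every thread. A tiny compile-time sanity check of that arithmetic (names local to this sketch, values copied from the configuration above):

```cpp
// Consistency of the "b threadwise copy 2x2" tuning block.
constexpr int BlockSize     = 256;
constexpr int GemmKPerBlock = 8;
constexpr int GemmNPerBlock = 128;

constexpr int SliceK   = 2;  // GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<2, 2>
constexpr int SliceN   = 2;
constexpr int ClusterK = 4;  // GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<4, 64>
constexpr int ClusterN = 64;

static_assert(SliceK * ClusterK == GemmKPerBlock, "per-thread slice x cluster must cover GemmK");
static_assert(SliceN * ClusterN == GemmNPerBlock, "per-thread slice x cluster must cover GemmN");
static_assert(ClusterK * ClusterN == BlockSize, "copy cluster must use exactly one thread per element");
```

The same relation holds for the A block copy: 4x1 slices on a 2x128 cluster cover 8x128 with 256 threads.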
...@@ -41,12 +41,13 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
const auto in_n_c_hi_wi_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(InDesc::GetLengths()), to_multi_index(InDesc::GetStrides()));
const auto wei_k_c_y_x_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(WeiDesc::GetLengths()), to_multi_index(WeiDesc::GetStrides()));
const auto out_n_k_ho_wo_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(OutDesc::GetLengths()), to_multi_index(OutDesc::GetStrides()));
// assume packed tensor
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed<4>(to_multi_index(InDesc::GetLengths()));
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed<4>(to_multi_index(WeiDesc::GetLengths()));
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed<4>(to_multi_index(OutDesc::GetLengths()));
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
...@@ -115,6 +116,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
// b thread copy 4x1
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
...@@ -142,6 +144,37 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
// b thread copy 2x2
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 8;
constexpr index_t GemmNLevel1Cluster = 8;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<4, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<2, 2>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<4, 64>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#endif
...@@ -169,7 +202,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr auto conv_driver =
#if 1 // debug
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
#endif
......
...@@ -217,7 +217,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 1
#elif 0
// 3x3, 35x35, stride 2
constexpr index_t N = 128;
constexpr index_t C = 288;
...@@ -352,7 +352,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
#elif 1
// 3x3, 28x28
constexpr index_t N = 128;
constexpr index_t C = 128;
......
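Editor's note: with the padded driver selected above, the problem-size switch now exercises the 3x3, 28x28 layer instead of the 35x35 stride-2 one. For reference, the output spatial length the implicit-GEMM formulation works with follows the usual convolution relation; a one-line sketch of that formula (the pad/dilation/stride values in the example are illustrative, not read from this config):

```cpp
// Output length along one spatial dimension of a padded, dilated, strided convolution.
constexpr int conv_out_len(int in_len, int left_pad, int right_pad, int filter, int dilation, int stride)
{
    return (in_len + left_pad + right_pad - dilation * (filter - 1) - 1) / stride + 1;
}

// Example: a 3x3 filter on a 28-wide input with pad 1, dilation 1, stride 1 keeps the size at 28.
static_assert(conv_out_len(28, 1, 1, 3, 1, 1) == 28, "same-size output");
```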