Commit c1ed17f8 authored by Chao Liu

improving index calculation: change to UpdateLowerIndex()

parent 77c81617
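
The core of this change: each index transform previously exposed CalculateLowerIndexDiff() plus a CalculateLowerIndexDiff_hack() variant that only computed the lower-index difference, and the caller then applied idx_low += idx_diff_low itself. These are folded into a single UpdateLowerIndex() that computes the difference and advances the lower index in one call. A minimal sketch of the new shape, using the pass-through transform as the simplest case (abridged from the diff below, not the complete code):

template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
__host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
                                                 const UpIdxDiff& idx_diff_up,
                                                 LowIdx& idx_low,
                                                 const UpIdx& idx_up_new,
                                                 Number<Hack>)
{
    // pass-through: the lower diff equals the upper diff
    idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];

    // the transform now also advances the lower index itself,
    // instead of leaving "idx_low += idx_diff_low" to the caller
    idx_low += idx_diff_low;
}
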
......@@ -157,7 +157,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
const index_t GemmM0 = GemmM / GemmM1;
const index_t GemmN0 = GemmN / GemmN1;
#if 1 // debug
const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
transform_dynamic_tensor_descriptor(
out_gemmm_gemmn_global_desc,
......@@ -165,16 +164,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
DynamicUnMerge<2>{make_multi_index(GemmN0, GemmN1)}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
#else
const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
transform_dynamic_tensor_descriptor(
out_gemmm_gemmn_global_desc,
make_tuple(
HackSemiDynamicUnMerge<3, Sequence<GemmM1>>{make_multi_index(1, GemmM0)},
HackSemiDynamicUnMerge<3, Sequence<GemmN1>>{make_multi_index(1, GemmN0)}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
#endif
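
For orientation (not part of the diff): the DynamicUnMerge<2> transforms above split the flat GemmM and GemmN dimensions of the output descriptor into (GemmM0, GemmM1) and (GemmN0, GemmN1), and this commit drops the commented-out HackSemiDynamicUnMerge alternative. The index map assumed here is the usual unmerge; a hypothetical standalone illustration for the N side (the real library computes the same thing inside DynamicUnMerge::CalculateLowerIndex):

#include <cstdint>
using index_t = int32_t;

// upper (n0, n1) with lengths (GemmN0, GemmN1) -> lower flat index n
index_t unmerge_lower_index(index_t n0, index_t n1, index_t GemmN1)
{
    return n0 * GemmN1 + n1;
}
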
// GEMM
using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v1<
......
......@@ -13,11 +13,6 @@ struct DynamicPassThrough
const UpperIndex up_lengths_;
#if 0
__host__ __device__ constexpr DynamicPassThrough(const DynamicPassThrough&) = default;
__host__ __device__ constexpr DynamicPassThrough(DynamicPassThrough&&) = default;
#else
__host__ __device__ constexpr DynamicPassThrough(const DynamicPassThrough& other)
: up_lengths_{other.up_lengths_}
{
......@@ -27,7 +22,6 @@ struct DynamicPassThrough
: up_lengths_{other.up_lengths_}
{
}
#endif
__host__ __device__ constexpr DynamicPassThrough(const index_t& low_length)
: up_lengths_{make_multi_index(low_length)}
......@@ -51,31 +45,26 @@ struct DynamicPassThrough
idx_low(Number<0>{}) = idx_up[Number<0>{}];
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
__host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
constexpr auto I0 = Number<0>{};
idx_diff_low(I0) = idx_diff_up[I0];
idx_low += idx_diff_low;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -103,11 +92,6 @@ struct DynamicPad
const index_t left_pad_;
const index_t right_pad_;
#if 0
__host__ __device__ constexpr DynamicPad(const DynamicPad&) = default;
__host__ __device__ constexpr DynamicPad(DynamicPad&&) = default;
#else
__host__ __device__ constexpr DynamicPad(const DynamicPad& other)
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
{
......@@ -117,7 +101,6 @@ struct DynamicPad
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
{
}
#endif
__host__ __device__ constexpr DynamicPad(const index_t& low_length,
const index_t& left_pad,
......@@ -146,32 +129,26 @@ struct DynamicPad
idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_;
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static constexpr void
CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
__host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
constexpr auto I0 = Number<0>{};
idx_diff_low(I0) = idx_diff_up[I0];
idx_low += idx_diff_low;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -199,11 +176,6 @@ struct DynamicLeftPad
const UpperIndex up_lengths_;
const index_t left_pad_;
#if 0
__host__ __device__ constexpr DynamicLeftPad(const DynamicLeftPad&) = default;
__host__ __device__ constexpr DynamicLeftPad(DynamicLeftPad&&) = default;
#else
__host__ __device__ constexpr DynamicLeftPad(const DynamicLeftPad& other)
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}
{
......@@ -213,7 +185,6 @@ struct DynamicLeftPad
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}
{
}
#endif
__host__ __device__ constexpr DynamicLeftPad(const index_t& low_length, const index_t& left_pad)
: up_lengths_{make_multi_index(low_length + left_pad)}, left_pad_{left_pad}
......@@ -238,32 +209,26 @@ struct DynamicLeftPad
idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_;
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static constexpr void
CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
__host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
constexpr auto I0 = Number<0>{};
idx_diff_low(I0) = idx_diff_up[I0];
idx_low += idx_diff_low;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -291,11 +256,6 @@ struct DynamicRightPad
const index_t low_length_;
const index_t right_pad_;
#if 0
__host__ __device__ constexpr DynamicRightPad(const DynamicRightPad&) = default;
__host__ __device__ constexpr DynamicRightPad(DynamicRightPad&&) = default;
#else
__host__ __device__ constexpr DynamicRightPad(const DynamicRightPad& other)
: up_lengths_{other.up_lengths_},
low_length_{other.low_length_},
......@@ -309,7 +269,6 @@ struct DynamicRightPad
right_pad_{other.right_pad_}
{
}
#endif
__host__ __device__ constexpr DynamicRightPad(const index_t& low_length,
const index_t& right_pad)
......@@ -339,32 +298,26 @@ struct DynamicRightPad
idx_low(Number<0>{}) = idx_up[Number<0>{}];
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static constexpr void
CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
__host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
constexpr auto I0 = Number<0>{};
idx_diff_low(I0) = idx_diff_up[I0];
idx_low += idx_diff_low;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -392,11 +345,6 @@ struct DynamicEmbed
const UpperIndex up_lengths_;
const UpperIndex coefficients_;
#if 0
__host__ __device__ constexpr DynamicEmbed(const DynamicEmbed&) = default;
__host__ __device__ constexpr DynamicEmbed(DynamicEmbed&&) = default;
#else
__host__ __device__ constexpr DynamicEmbed(const DynamicEmbed& other)
: up_lengths_{other.up_lengths_}, coefficients_{other.coefficients_}
{
......@@ -406,7 +354,7 @@ struct DynamicEmbed
: up_lengths_{other.up_lengths_}, coefficients_{other.coefficients_}
{
}
#endif
__host__ __device__ constexpr DynamicEmbed(const UpperIndex& up_lengths,
const UpperIndex& coefficients)
: up_lengths_{up_lengths}, coefficients_{coefficients}
......@@ -447,11 +395,16 @@ struct DynamicEmbed
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == NDimUp &&
LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
......@@ -461,20 +414,8 @@ struct DynamicEmbed
static_for<0, NDimUp, 1>{}(
[&](auto i) { idx_diff_low(Number<0>{}) += idx_diff_up[i] * coefficients_[i]; });
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
idx_low += idx_diff_low;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -502,11 +443,6 @@ struct DynamicMerge
const LowerIndex low_lengths_scan_;
const UpperIndex up_lengths_;
#if 0
__host__ __device__ constexpr DynamicMerge(const DynamicMerge&) = default;
__host__ __device__ constexpr DynamicMerge(DynamicMerge&&) = default;
#else
__host__ __device__ constexpr DynamicMerge(const DynamicMerge& other)
: low_lengths_{other.low_lengths_},
low_lengths_scan_{other.low_lengths_scan_},
......@@ -520,7 +456,6 @@ struct DynamicMerge
up_lengths_{other.up_lengths_}
{
}
#endif
__host__ __device__ constexpr DynamicMerge(const LowerIndex& low_lengths)
: low_lengths_{low_lengths},
......@@ -562,101 +497,15 @@ struct DynamicMerge
idx_low(Number<NDimLow - 1>{}) = tmp;
}
// idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then
// idx_diff_low_const also
// doesn't change during the whole kernel execution. Compiler generated
// ISA should
// only calculate idx_diff_low_const once and save it during the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
CalculateLowerIndex(idx_diff_low_const, idx_diff_up);
#else
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
// this should be saved in SGPR as well
index_t idx_low_length_minus_idx_diff_low_const =
low_lengths_[i] - idx_diff_low_const[i];
#if 1
index_t idx_low_length_plus_idx_diff_low_const =
low_lengths_[i] + idx_diff_low_const[i];
#endif
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const;
#if 1
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
#endif
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const : idx_diff_low_const[i];
#if 1
idx_diff_low(i) = do_borrow ? idx_low_length_plus_idx_diff_low_const : idx_diff_low[i];
#endif
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
#if 1
carry = do_borrow ? -1 : carry;
#endif
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
// idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
//
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
// this version saves computation but uses more registers
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack_1(LowIdxDiff& idx_diff_low,
__host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
......@@ -675,11 +524,9 @@ struct DynamicMerge
// only calculate idx_diff_low_const once and save it during the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
// computed at run-time each time this function is called, and can be
// very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
......@@ -690,12 +537,6 @@ struct DynamicMerge
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
});
#else
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
......@@ -706,389 +547,70 @@ struct DynamicMerge
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
idx_low_length_plus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]);
});
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t borrow = 0;
bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] - borrow;
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
idx_diff_low(i) -= borrow;
index_t idx_low_tmp = idx_low[i] + idx_diff_low[i];
borrow = do_borrow ? 1 : 0;
});
do_carry = idx_low_tmp >= low_lengths_[i];
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
}
else
// TODO: use exec-mask inline asm
if(do_carry)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
idx_diff_low(i) -= low_lengths_[i];
}
}
// idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
// this version uses fewer registers but more computation
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack_2(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then
// idx_diff_low_const also
// doesn't change during the whole kernel execution. Compiler generated
// ISA should
// only calculate idx_diff_low_const once and save it during the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
#else
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
idx_low(i) += idx_diff_low[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
do_carry = idx_low_tmp >= low_lengths_[i];
constexpr auto I0 = Number<0>{};
idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(I0) = idx_diff_low_const[I0] + do_carry;
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
idx_low(I0) += idx_diff_low[I0];
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do borrow check on each low dimension in reversed order
// do not need to check the first dimension
bool do_borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow;
index_t idx_low_tmp = idx_low[i] + idx_diff_low[i];
do_borrow = idx_low_tmp < 0;
idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
}
else
// TODO: use exec-mask inline asm
if(do_borrow)
{
#if 0
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
#endif
}
idx_diff_low(i) += low_lengths_[i];
}
// idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
// this version uses fewer registers but more computation
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack_3(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then
// idx_diff_low_const also
// doesn't change during the whole kernel execution. Compiler generated
// ISA should
// only calculate idx_diff_low_const once and save it during the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
#else
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
if constexpr(Hack == 1)
{
#if 1
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
do_carry = idx_low_tmp >= low_lengths_[i];
idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
#else
LowerIndex idx_low_new = idx_low_old;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
auto i_m1 = i - Number<1>{};
int64_t exec_mask;
int64_t do_carry;
idx_low_new(i) = idx_diff_low_const[i] + idx_low_old[i];
asm volatile(
"\n \
s_mov_b64 %0, exec \n \
v_cmpx_le_u32_e64 %1, %4, %2 \n \
v_subrev_u32 %2, %4, %2\n \
v_add_u32 %3, %3, 1\n \
s_mov_b64 exec, %0\n \
"
: "=s"(exec_mask), "=s"(do_carry), "=v"(idx_low_new(i)), "=v"(idx_low_new(i_m1))
: "s"(low_lengths_[i]), "2"(idx_low_new[i]), "3"(idx_low_new[i_m1]));
idx_diff_low(i) = idx_low_new[i] - idx_low_old[i];
idx_low(i) += idx_diff_low[i];
});
constexpr auto I0 = Number<0>{};
idx_low_new(I0) += idx_diff_low_const[I0];
idx_diff_low(I0) = idx_low_new[I0] - idx_low_old[I0];
#endif
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
idx_diff_low(I0) = idx_diff_low_const[I0] - do_borrow;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow;
do_borrow = idx_low_tmp < 0;
idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
idx_low(I0) += idx_diff_low[I0];
}
else
{
#if 0
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
#endif
// not implemented
}
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
#if 0
// this version saves computation but uses more registers
CalculateLowerIndexDiff_hack_1(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#elif 1
// this version uses fewer registers but more computation
CalculateLowerIndexDiff_hack_2(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#elif 1
// this version uses fewer registers but more computation
CalculateLowerIndexDiff_hack_3(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#endif
}
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
......@@ -1156,27 +678,20 @@ struct DynamicUnMerge
}
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
{
CalculateLowerIndex(idx_diff_low, idx_diff_up);
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
__host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
CalculateLowerIndex(idx_diff_low, idx_diff_up);
idx_low += idx_diff_low;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -1221,139 +736,18 @@ struct DynamicFreeze
idx_low(Number<0>{}) = low_idx_;
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
idx_diff_low(Number<0>{}) = index_t{Number<0>{}};
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
__host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
template <typename UpIdx>
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
};
#if 0
template <index_t NDimUp, typename StaticPartialUpLengths>
struct HackSemiDynamicUnMerge
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const UpperIndex up_lengths_;
const UpperIndex up_lengths_scan_;
static constexpr index_t NDimUpStatic = StaticPartialUpLengths::Size();
static constexpr index_t NDimUpDynamic = NDimUp - NDimUpStatic;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_scan_;
static constexpr auto static_partial_up_lengths_ = StaticPartialUpLengths{};
static constexpr auto static_partial_up_lengths_scan_ = reverse_exclusive_scan_sequence(
static_partial_up_lengths_, math::multiplies<index_t>(), Number<1>{});
__host__ __device__ constexpr HackSemiDynamicUnMerge(
const MultiIndex<NDimUpDynamic>& dynamic_partial_up_lengths)
: dynamic_partial_up_lengths_{dynamic_partial_up_lengths},
dynamic_partial_up_lengths_scan_{
container_reverse_exclusive_scan(dynamic_partial_up_lengths,
math::multiplies<index_t>(),
static_partial_up_lengths_scan_[Number<0>{}])}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ constexpr HackSemiDynamicUnMerge()
: up_lengths_{make_zero_multi_index<NDimUpDynamic>()},
up_lengths_scan_{make_zero_multi_index<NDimUpStatic>()}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr const auto GetUpperLengths() const
{
UpperIndex up_lengths;
static_for<0, NDimUpDynamic, 1>{}(
[&](auto i) { up_lengths(i) = dynamic_partial_up_lengths_[i]; });
static_for<0, NDimUpStatic, 1>{}([&](auto i) {
up_lengths(i + Number<NDimUpDynamic>{}) = static_partial_up_lengths_[i];
});
return up_lengths;
}
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{
idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
static_for<0, NDimUpDynamic, 1>{}([&](auto i) {
idx_low(Number<0>{}) += idx_up[i] * dynamic_partial_up_lengths_scan_[i];
});
static_for<NDimUpDynamic, NDimUp - 1, 1>{}([&](auto i) {
idx_low(Number<0>{}) +=
idx_up[i] * static_partial_up_lengths_scan_[i + Number<NDimUpDynamic>{}];
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
{
CalculateLowerIndex(idx_diff_low, idx_diff_up);
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
idx_diff_low(Number<0>{}) = index_t{Number<0>{}};
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -1370,7 +764,6 @@ struct HackSemiDynamicUnMerge
return true;
}
};
#endif
} // namespace ck
#endif
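
For context on the carry/borrow handling in DynamicMerge::UpdateLowerIndex above: the per-dimension lower diff produced by a constant upper step is itself constant, but whether it overflows (carries) or underflows (borrows) into the next slower dimension depends on the current lower index, which is why the update has to read idx_low. A standalone sketch of the Hack == 1 (forward step, carry-only) case for a two-dimensional merge; the names and the plain-C++ setting are illustrative only:

#include <array>
#include <cstdint>

using index_t = int32_t;

// Merge of two lower dimensions with lengths {L0, L1}. An upper step d maps to
// the constant per-dimension diff {d / L1, d % L1}; a single carry is resolved
// against the current lower index (this assumes the step wraps each dimension
// at most once, as the library code does).
void update_merged_index(std::array<index_t, 2>& idx_low, index_t d, index_t L1)
{
    std::array<index_t, 2> idx_diff_low{d / L1, d % L1};

    const bool do_carry = idx_low[1] + idx_diff_low[1] >= L1;

    if(do_carry)
        idx_diff_low[1] -= L1; // wrap the fast dimension

    idx_diff_low[0] += do_carry ? 1 : 0; // propagate the carry

    idx_low[1] += idx_diff_low[1];
    idx_low[0] += idx_diff_low[0];
}

// Example: lengths {4, 8}, idx_low = {1, 6}, d = 3
//   -> diff {0, 3}, and 6 + 3 >= 8 triggers a carry, so idx_low becomes {2, 1}
//   (flat index 1*8 + 6 = 14 advances to 17 = 2*8 + 1)
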
......@@ -502,27 +502,18 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran);
const auto idx_up = get_container_subset(idx_hidden, dims_up);
const auto idx_up_new = get_container_subset(idx_hidden, dims_up);
auto idx_low = get_container_subset(idx_hidden, dims_low);
const auto idx_diff_up = get_container_subset(idx_diff_hidden, dims_up);
MultiIndex<dims_low.Size()> idx_diff_low;
// calculate idx_diff_low
#if 0 // hack
tran.CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low, idx_up);
#else
// HACK: control CalculateLowerIndexDiff for DynamicMerge using ing hack
// HACK: control UpdateLowerIndex for DynamicMerge using hack
// TODO remove hack
constexpr index_t Hack =
decltype(coord_step.hack_calculate_lower_index_diff_)::At(itran);
tran.CalculateLowerIndexDiff_hack(
idx_diff_low, idx_diff_up, idx_low, idx_up, Number<Hack>{});
#endif
// update idx_low
idx_low += idx_diff_low;
tran.UpdateLowerIndex(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
set_container_subset(idx_diff_hidden, dims_low, idx_diff_low);
set_container_subset(idx_hidden, dims_low, idx_low);
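
A note on the Hack value threaded through here: judging from the Hack == 1 and Hack == 2 branches of DynamicMerge::UpdateLowerIndex above, 1 selects the carry-only path for non-negative steps, 2 selects the borrow-only path for non-positive steps, and the general case is left unimplemented. The value per transform comes from the coordinate step, as in the ThreadwiseDynamicTensorSliceTransfer_v3 hunks further down; a hypothetical pair of step definitions (the descriptor and the hack sequence values are placeholders):

// forward step along the fastest dimension -> request the carry-only path
const auto step_p1 = make_dynamic_tensor_coordinate_step_hack(
    src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});

// backward step -> request the borrow-only path
const auto step_m1 = make_dynamic_tensor_coordinate_step_hack(
    src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 2>{});
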
......
......@@ -384,13 +384,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
Float,
decltype(c_m0_m1_n0_n1_thread_desc),
decltype(c_m0_m1_n0_n1_global_desc),
#if 1 // debug
Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
CThreadTransferSrcDstAccessOrder,
#else
Sequence<1, 1, 2, 4>,
Sequence<0, 1, 2, 3>,
#endif
CThreadTransferSrcDstVectorDim,
1,
CThreadTransferDstScalarPerVector,
......
......@@ -922,7 +922,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 0
#elif 1
// for non-padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
......@@ -1067,7 +1067,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
// for padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
#elif 0
#elif 1
// for non-padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
......
......@@ -152,7 +152,7 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
#if 1 // debug
#if 0 // debug
float tmp = __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
......
......@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto conv_driver =
#if 1 // debug
#if 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
......