gaoqiong / composable_kernel · Commits

Commit 44ddcdcb, authored Jan 18, 2021 by Chao Liu

    adding vector load

Parent: c1ed17f8

Showing 8 changed files with 507 additions and 143 deletions (+507 -143)
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp            +167   -6
composable_kernel/include/tensor_description/multi_index.hpp                               +11    -0
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp    +252   -126
composable_kernel/include/utility/config.amd.hpp.in                                        +1     -1
composable_kernel/include/utility/float_type.amd.hpp.in                                    +73    -7
driver/include/device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp            +1     -1
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp    +1     -1
driver/src/conv_driver.cpp                                                                 +1     -1
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp (view file @ 44ddcdcb)

@@ -502,11 +502,154 @@ struct DynamicMerge

UpdateLowerIndex is renamed to UpdateLowerIndex_1, its idx_up_new parameter is commented out as unused, and the body now precomputes the decomposed index step and the carry/borrow thresholds before applying the Hack-dependent update:

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ void UpdateLowerIndex_1(LowIdxDiff& idx_diff_low,
                                                const UpIdxDiff& idx_diff_up,
                                                LowIdx& idx_low,
                                                const UpIdx& /* idx_up_new */,
                                                Number<Hack>) const
    {
        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
                      "wrong! inconsistent # of dimension");

        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
        // However,
        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
        //      can be calculated at compile-time.
        //   2) If idx_diff_up is not known at compile-time, but its value doesn't change
        //      during the whole kernel execution, then idx_diff_low_const also doesn't
        //      change during the whole kernel execution. Compiler-generated ISA should
        //      only calculate idx_diff_low_const once and save it during the whole
        //      kernel execution.
        // If neither 1) nor 2) is satisfied, then the calculation will be computed at
        // run-time each time this function is called, and can be very expensive.
        LowerIndex idx_diff_low_const;
        LowerIndex idx_low_length_minus_idx_diff_low_const;
        LowerIndex idx_low_length_plus_idx_diff_low_const;

#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
        index_t tmp = idx_diff_up[Number<0>{}];

        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
        });

        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;

        static_for<0, NDimLow, 1>{}([&](auto i) {
            idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
            idx_low_length_plus_idx_diff_low_const(i)  = low_lengths_[i] + idx_diff_low_const[i];
        });
#else
        // Hack: this forces the result into SGPRs. Need to make sure the result is
        // thread-invariant.
        index_t tmp = idx_diff_up[Number<0>{}];

        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
            idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
        });

        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);

        static_for<0, NDimLow, 1>{}([&](auto i) {
            idx_low_length_minus_idx_diff_low_const(i) =
                __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
            idx_low_length_plus_idx_diff_low_const(i) =
                __builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]);
        });
#endif

        if constexpr(Hack == 1)
        {
            // do carry check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t carry = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                index_t idx_low_tmp = idx_low[i] + carry;

                bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];

                idx_diff_low(i) += carry;

                carry = do_carry ? 1 : 0;
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;

            idx_low += idx_diff_low;
        }
        else if constexpr(Hack == 2)
        {
            // do borrow check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t borrow = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                index_t idx_low_tmp = idx_low[i] - borrow;

                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];

                idx_diff_low(i) -= borrow;

                borrow = do_borrow ? 1 : 0;
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;

            idx_low += idx_diff_low;
        }
        else
        {
            // do carry and borrow check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t carry = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                index_t idx_low_tmp = idx_low[i] + carry;

                bool do_carry  = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
                idx_diff_low(i) =
                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];

                idx_diff_low(i) += carry;

                carry = do_carry ? 1 : 0;
                carry = do_borrow ? -1 : carry;
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;

            idx_low += idx_diff_low;
        }
    }

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ void UpdateLowerIndex_2(LowIdxDiff& idx_diff_low,
                                                const UpIdxDiff& idx_diff_up,
                                                LowIdx& idx_low,
                                                const UpIdx& /* idx_up_new */,
                                                Number<Hack>) const
    {
        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
...
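The carry/borrow passes above are, in effect, a mixed-radix counter update: the upper-index step is decomposed into per-dimension increments, and overflow is propagated from the fastest dimension toward the slowest. A minimal host-side sketch of the Hack == 1 (carry-only) path, using plain arrays instead of LowerIndex/static_for/Number (all names here are illustrative, not from the library):

    #include <array>
    #include <cstdio>

    // Sketch: advance a 3-digit mixed-radix index (lengths {2, 3, 4}) by a
    // precomputed per-dimension increment, propagating carries from the last
    // dimension toward the first -- the same idea as UpdateLowerIndex_1 with
    // Hack == 1, minus the SGPR hack.
    int main()
    {
        const std::array<int, 3> low_lengths{2, 3, 4};
        std::array<int, 3> idx_low{0, 2, 3};                  // current lower index
        const std::array<int, 3> idx_diff_low_const{0, 0, 1}; // decomposed "+1" step

        std::array<int, 3> idx_diff_low{};
        int carry = 0;

        for(int i = 2; i > 0; --i)
        {
            const int idx_low_tmp = idx_low[i] + carry;
            const bool do_carry   = idx_low_tmp >= low_lengths[i] - idx_diff_low_const[i];

            idx_diff_low[i] = (do_carry ? -(low_lengths[i] - idx_diff_low_const[i])
                                        : idx_diff_low_const[i]) +
                              carry;
            carry = do_carry ? 1 : 0;
        }
        idx_diff_low[0] = idx_diff_low_const[0] + carry;

        for(int i = 0; i < 3; ++i)
            idx_low[i] += idx_diff_low[i];

        std::printf("%d %d %d\n", idx_low[0], idx_low[1], idx_low[2]); // prints "1 0 0"
    }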
@@ -611,6 +754,24 @@ struct DynamicMerge

A dispatching UpdateLowerIndex is added; it currently forwards to UpdateLowerIndex_1:

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
                                              const UpIdxDiff& idx_diff_up,
                                              LowIdx& idx_low,
                                              const UpIdx& idx_up_new,
                                              Number<Hack>) const
    {
#if 1
        UpdateLowerIndex_1(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#else
        UpdateLowerIndex_2(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#endif
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...
@@ -624,7 +785,7 @@ struct DynamicMerge
    {
        return true;
    }
};

template <index_t NDimUp, bool Use24BitIntegerCalculation = false>
struct DynamicUnMerge
...
composable_kernel/include/tensor_description/multi_index.hpp (view file @ 44ddcdcb)

@@ -152,6 +152,17 @@ __host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
    return r;
}

// MultiIndex = index_t * MultiIndex
template <typename... Xs>
__host__ __device__ constexpr auto operator*(index_t a, const Tuple<Xs...>& x)
{
    constexpr index_t NSize = sizeof...(Xs);

    Tuple<Xs...> r;

    static_for<0, NSize, 1>{}([&](auto i) { r(i) = a * x[i]; });

    return r;
}

#endif

} // namespace ck
#endif
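The new overload lets the scalar appear on the left of a MultiIndex product, mirroring the existing MultiIndex-times-scalar operator shown in the hunk header. A usage sketch (assumes the ck headers are included and the code sits inside namespace ck; values are illustrative):

    const auto idx = make_multi_index(1, 3, 4);

    const auto a = idx * 2; // existing overload: MultiIndex * scalar -> {2, 6, 8}
    const auto b = 2 * idx; // overload added in this commit: scalar * MultiIndex -> {2, 6, 8}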
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp (view file @ 44ddcdcb)

@@ -735,74 +735,148 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3

RunRead no longer iterates scalar-by-scalar over SliceLengths{}[0] x SliceLengths{}[1]; it now derives a per-dimension access count from SrcVectorDim / SrcScalarPerVector / SrcScalarStrideInVector and (behind preprocessor branches) prepares for vector loads:

    __device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
    {
        // hardcoded for 2D
        // TODO implement N-D
        static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
                      "wrong! hardcoded for 2D tensor");

        constexpr auto src_scalar_per_access = [&]() {
            Index src_scalar_per_access;

            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(i == SrcVectorDim)
                {
                    src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
                }
                else
                {
                    src_scalar_per_access(i) = 1;
                }
            });

            return src_scalar_per_access;
        }();

        constexpr auto src_scalar_step_in_vector = [&]() {
            Index src_scalar_step_in_vector;

            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(i == SrcVectorDim)
                {
                    src_scalar_step_in_vector(i) = 1;
                }
                else
                {
                    src_scalar_step_in_vector(i) = 0;
                }
            });

            return src_scalar_step_in_vector;
        }();

        constexpr auto access_lengths = [&]() {
            Index access_lengths;

            static_for<0, nDim, 1>{}([&](auto i) {
                access_lengths(i) = SliceLengths{}[i] / src_scalar_per_access[i];
            });

            return access_lengths;
        }();

        // TODO use constexpr for coordinate-step to make sure compiler behaves correctly
        const auto src_step_0_p1 =
            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
        const auto src_step_0_m1 =
            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
        const auto src_step_p1_0 =
            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
        const auto src_step_m1_0 =
            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        static_for<0, access_lengths[I0], 1>{}([&](auto iter0) {
            static_for<0, access_lengths[I1], 1>{}([&](auto iter1) {
                // step direction
                constexpr bool forward_dim1 = (iter0.value % 2 == 0);

                constexpr index_t i0 = iter0.value;
                constexpr index_t i1 =
                    forward_dim1 ? iter1.value : access_lengths[I1] - iter1.value - 1;

                // do work
                // hardcoding for buffer_load
                // TODO refactor transfer_data() to encapsulate this
                static_assert(SrcAddressSpace == AddressSpace::Global,
                              "wrong! hardcoded to use buffer_load, src must be global mem");

#if 1 // only works for SrcScalarPerVector == 1
                auto src_data = amd_buffer_load<SrcData, 1>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1));

                buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#elif 1 // only works for SrcScalarPerVector == 1
                auto src_data = amd_buffer_load<SrcData, 1>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1) * src_scalar_per_access);

                buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#else
                vector_type<SrcData, SrcScalarPerVector> src_vector;

                using SrcVectorType =
                    typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;

                src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

                src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};

                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
                        make_multi_index(i0, i1) * src_scalar_per_access +
                        i * src_scalar_step_in_vector);

                    // TODO: can buffer_ use vector access?
                    buffer_(Number<buffer_offset>{}) = src_vector[i];
                });
#endif

                // move dim1 iterator
                if constexpr(iter1.value < access_lengths[I1] - 1)
                {
                    if constexpr(forward_dim1)
                    {
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1);
                    }
                    else
                    {
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1);
                    }
                }
            });

            // move dim0 iterator
            if constexpr(iter0.value < access_lengths[I0] - 1)
            {
                move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
            }
        });

        // move src coordinate back to its slice origin
        if constexpr(SrcResetCoordinateAfterRun)
...
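The forward_dim1 flag above makes the loop walk the 2D access grid in snake order, so advancing to the next dim-0 row never requires resetting dim 1; every move is a single +/-1 coordinate step. A standalone sketch of the visit order it produces (plain C++, no ck types):

    #include <cstdio>

    // Sketch: visit a Len0 x Len1 access grid in snake order, i.e. the same
    // (i0, i1) sequence RunRead generates, so each move is a unit step.
    int main()
    {
        constexpr int Len0 = 3, Len1 = 4;

        for(int iter0 = 0; iter0 < Len0; ++iter0)
        {
            const bool forward_dim1 = (iter0 % 2 == 0);

            for(int iter1 = 0; iter1 < Len1; ++iter1)
            {
                const int i0 = iter0;
                const int i1 = forward_dim1 ? iter1 : Len1 - iter1 - 1;
                std::printf("(%d,%d) ", i0, i1);
            }
        }
        // prints: (0,0) (0,1) (0,2) (0,3) (1,3) (1,2) (1,1) (1,0) (2,0) (2,1) (2,2) (2,3)
        std::printf("\n");
    }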
@@ -893,13 +967,54 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3

RunRead_hack gains the same vector-access bookkeeping before its coordinate steps are chosen:

    __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
    {
        // hardcoded for 2D
        // TODO implement N-D
        static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
                      "wrong! hardcoded for 2D tensor");

        constexpr auto src_scalar_per_access = [&]() {
            Index src_scalar_per_access;

            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(i == SrcVectorDim)
                {
                    src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
                }
                else
                {
                    src_scalar_per_access(i) = 1;
                }
            });

            return src_scalar_per_access;
        }();

        constexpr auto src_scalar_step_in_vector = [&]() {
            Index src_scalar_step_in_vector;

            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(i == SrcVectorDim)
                {
                    src_scalar_step_in_vector(i) = 1;
                }
                else
                {
                    src_scalar_step_in_vector(i) = 0;
                }
            });

            return src_scalar_step_in_vector;
        }();

        constexpr auto access_lengths = [&]() {
            Index access_lengths;

            static_for<0, nDim, 1>{}([&](auto i) {
                access_lengths(i) = SliceLengths{}[i] / src_scalar_per_access[i];
            });

            return access_lengths;
        }();

#if 0 // hack
        // TODO use constexpr for coordinate-step to make sure compiler behave correctly
        const auto src_step_0_p1 =
...
@@ -911,91 +1026,102 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3

The active coordinate-step hack is switched from the padded-input Sequences (11 hack flags) to the non-padded ones (7 hack flags), and the inner loop now runs over access_lengths, computes the buffer offset with src_scalar_per_access, and carries an (inactive) SrcScalarPerVector-wide vector path:

#elif 0
        // for padded input tensor
        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});

        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});

        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});

        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 1
        // for non-padded input tensor
        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});

        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 2>{});

        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 1, 0>{});

        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 2, 0>{});
#endif

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        static_for<0, access_lengths[I0], 1>{}([&](auto iter0) {
            static_for<0, access_lengths[I1], 1>{}([&](auto iter1) {
                // step direction
                constexpr bool forward_dim1 = (iter0.value % 2 == 0);

                constexpr index_t i0 = iter0.value;
                constexpr index_t i1 =
                    forward_dim1 ? iter1.value : access_lengths[I1] - iter1.value - 1;

                // do work
                // hardcoding for buffer_load
                // TODO refactor transfer_data() to encapsulate this
                static_assert(SrcAddressSpace == AddressSpace::Global,
                              "wrong! hardcoded to use buffer_load, src must be global mem");

#if 1 // only works for SrcScalarPerVector == 1
                auto src_data = amd_buffer_load<SrcData, 1>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1) * src_scalar_per_access);

                buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#elif 1
                vector_type<SrcData, SrcScalarPerVector> src_vector;

                using SrcVectorType =
                    typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;

                src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

                src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};

                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
                        make_multi_index(i0, i1) * src_scalar_per_access +
                        i * src_scalar_step_in_vector);

                    // TODO: can buffer_ use vector access?
                    buffer_(Number<buffer_offset>{}) = src_vector[i];
                });
#endif

                // move dim1 iterator
                if constexpr(iter1.value < access_lengths[I1] - 1)
                {
                    if constexpr(forward_dim1)
                    {
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1);
                    }
                    else
                    {
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1);
                    }
                }
            });

            // move dim0 iterator
            if constexpr(iter0.value < access_lengths[I0] - 1)
            {
                move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
            }
        });

        // move src coordinate back to its slice origin
        if constexpr(SrcResetCoordinateAfterRun)
...
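To make the bookkeeping at the top of RunRead / RunRead_hack concrete, here is the arithmetic the three constexpr lambdas perform, worked for a hypothetical configuration (the numbers are invented for illustration):

    #include <cstdio>

    // Hypothetical configuration: a 2D slice of SliceLengths = {4, 8},
    // vectorized along dim 1 with SrcScalarPerVector = 4 and
    // SrcScalarStrideInVector = 1.
    int main()
    {
        const int slice_lengths[2]            = {4, 8};
        const int src_vector_dim              = 1;
        const int src_scalar_per_vector       = 4;
        const int src_scalar_stride_in_vector = 1;

        int src_scalar_per_access[2], src_scalar_step_in_vector[2], access_lengths[2];

        for(int i = 0; i < 2; ++i)
        {
            src_scalar_per_access[i] =
                (i == src_vector_dim) ? src_scalar_per_vector * src_scalar_stride_in_vector : 1;
            src_scalar_step_in_vector[i] = (i == src_vector_dim) ? 1 : 0;
            access_lengths[i]            = slice_lengths[i] / src_scalar_per_access[i];
        }

        // prints: per_access {1 4}  step_in_vector {0 1}  access_lengths {4 2}
        // i.e. the 4x8 scalar slice is covered by 4x2 accesses of width 4.
        std::printf("per_access {%d %d}  step_in_vector {%d %d}  access_lengths {%d %d}\n",
                    src_scalar_per_access[0], src_scalar_per_access[1],
                    src_scalar_step_in_vector[0], src_scalar_step_in_vector[1],
                    access_lengths[0], access_lengths[1]);
    }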
@@ -1063,7 +1189,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3

The #elif guard for the padded-input adjusted_step is flipped from 1 to 0:

#if 0 // hack
        const auto adjusted_step = make_dynamic_tensor_coordinate_step(
            src_desc, adjusted_step_idx);
#elif 0
        // for padded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
...
composable_kernel/include/utility/config.amd.hpp.in (view file @ 44ddcdcb)

@@ -87,7 +87,7 @@
 // thread-invariant, otherwise it's a bug
 // TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
 #ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
-#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 1
+#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
 #endif

 // workaround: put all workaround here
...
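When this macro is 1, DynamicMerge::UpdateLowerIndex_1 routes idx_diff_low_const through __builtin_amdgcn_readfirstlane, a clang AMDGPU builtin that broadcasts lane 0's value to the whole wavefront and thereby keeps the value in scalar registers; that is only correct when the value is already thread-invariant, which is what the comment above warns about. A minimal sketch of the pattern outside the library (hypothetical HIP device function):

    // Sketch: force a thread-invariant integer result into an SGPR.
    // Only valid because every lane computes the same x / divisor; otherwise
    // all lanes would silently receive lane 0's result.
    __device__ int divide_thread_invariant(int x, int divisor)
    {
    #if defined(__HIP_DEVICE_COMPILE__)
        return __builtin_amdgcn_readfirstlane(x / divisor);
    #else
        return x / divisor; // fallback for host or non-AMD compilation
    #endif
    }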
composable_kernel/include/utility/float_type.amd.hpp.in (view file @ 44ddcdcb)

@@ -182,11 +182,28 @@ struct vector_type<float, 1>

The static SetScalar interface is replaced by a data_ member with Size(), Vector() and element accessors:

{
    using MemoryType = float;

    float data_;

    __host__ __device__ static constexpr index_t Size() { return 1; }

    __host__ __device__ constexpr const auto& Vector() const { return data_; }

    __host__ __device__ constexpr auto& Vector() { return data_; }

    template <index_t I>
    __host__ __device__ constexpr const auto& operator[](Number<I>) const
    {
        static_assert(I == 0, "wrong!");
        return data_;
    }

    template <index_t I>
    __host__ __device__ constexpr auto& operator()(Number<I>)
    {
        static_assert(I == 0, "wrong!");
        return data_;
    }
};
...

@@ -222,13 +239,62 @@ struct vector_type<float, 4>

vector_type<float, 4> gets the same treatment, backed by a union of the packed vector and its scalars:

{
    using MemoryType = float4_t;

    union
    {
        float4_t v;
        float s0, s1, s2, s3;
    } data_;

    __host__ __device__ static constexpr index_t Size() { return 4; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.v; }

    __host__ __device__ constexpr auto& Vector() { return data_.v; }

    template <index_t I>
    __host__ __device__ constexpr const auto& operator[](Number<I>) const
    {
        static_assert(I >= 0 && I < 4, "wrong!");

        if constexpr(I == 0)
        {
            return data_.s0;
        }
        else if constexpr(I == 1)
        {
            return data_.s1;
        }
        else if constexpr(I == 2)
        {
            return data_.s2;
        }
        else
        {
            return data_.s3;
        }
    }

    template <index_t I>
    __host__ __device__ constexpr auto& operator()(Number<I>)
    {
        static_assert(I >= 0 && I < 4, "wrong!");

        if constexpr(I == 0)
        {
            return data_.s0;
        }
        else if constexpr(I == 1)
        {
            return data_.s1;
        }
        else if constexpr(I == 2)
        {
            return data_.s2;
        }
        else
        {
            return data_.s3;
        }
    }
};
...
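These accessors are what the (currently disabled) vector path in RunRead relies on. A hedged usage sketch, assuming the ck headers are in scope inside a kernel; note that in this revision s0..s3 are all members of one union and therefore share storage, so the sketch only leans on Vector() and the accessor signatures:

    vector_type<float, 4> v;

    v.Vector() = float4_t{0};       // whole-vector assignment, e.g. the result of amd_buffer_load
    float s0   = v[Number<0>{}];    // read-only element access via operator[]
    v(Number<0>{}) = s0 + 1.0f;     // mutable element access via operator()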
driver/include/device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp (view file @ 44ddcdcb)

@@ -929,7 +929,7 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 2;

     constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
-#elif 1
+#elif 0
     // cdata = 64, BlockSize = 64, 64x64x3
     constexpr index_t BlockSize = 64;
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp (view file @ 44ddcdcb)

@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

     constexpr auto conv_driver =
-#if 1
+#if 0
         DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
 #else
         DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
...
driver/src/conv_driver.cpp (view file @ 44ddcdcb)

@@ -22,7 +22,7 @@ int main(int argc, char* argv[])
 {
     using namespace ck;

-#if 0
+#if 1
     // 3x3, 35x35, stride 2
     constexpr index_t N = 128;
     constexpr index_t C = 192;
...