Commit f2f35201 authored by Chao Liu

add hack for merge transformation

parent fa479ce4
...@@ -64,6 +64,20 @@ struct DynamicPassThrough
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...@@ -141,6 +155,20 @@ struct DynamicLeftPad
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...@@ -228,6 +256,20 @@ struct DynamicRightPad
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...@@ -324,6 +366,20 @@ struct DynamicEmbed
[&](auto i) { idx_diff_low(Number<0>{}) += idx_diff_up[i] * coefficients_[i]; });
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...@@ -489,6 +545,141 @@ struct DynamicMerge
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
// idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bounds
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then idx_diff_low_const
// also doesn't change during the whole kernel execution. The compiler-generated
// ISA should calculate idx_diff_low_const only once and keep it for the whole
// kernel execution.
// If neither 1) nor 2) is satisfied, then the calculation is done at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
});
#else
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
idx_low_length_plus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]);
});
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
else if constexpr(Hack == 2)
{
// do borrow check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) += carry;
carry = do_borrow ? -1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
else
{
// do carry and borrow check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
}
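// Illustrative example (assumed values, not from this commit): with low_lengths_ = {4, 3} and
// low_lengths_scan_ = {3, 1}, an upper step idx_diff_up = {+1} gives idx_diff_low_const = {0, 1}.
// For idx_low_old = {1, 2} the last dimension would reach its length (2 + 1 == 3), so do_carry
// is true and idx_diff_low becomes {+1, -2}, moving the lower index from {1, 2} to {2, 0}.
// Hack == 1 assumes the step only ever increases the merged index (carry check only),
// Hack == 2 assumes it only ever decreases it (borrow check only), and any other value
// performs both checks at a small extra cost per dimension.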
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...@@ -551,6 +742,20 @@ struct DynamicUnMerge
CalculateLowerIndex(idx_diff_low, idx_diff_up);
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...@@ -602,6 +807,20 @@ struct DynamicFreeze
idx_diff_low(Number<0>{}) = index_t{Number<0>{}};
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...
...@@ -9,30 +9,13 @@ namespace ck {
template <index_t NDimHidden, typename VisibleDimensionIds>
struct DynamicTensorCoordinate;
#if 0 // hack
template <index_t NTransform, index_t NDimVisible>
#else
template <index_t NTransform, index_t NDimVisible, typename HackCalculateLowerIndexDiff>
#endif
struct DynamicTensorCoordinateStep;
template <typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto make_dynamic_tensor_coordinate(const TensorDesc& tensor_desc,
const VisibleIndex& idx_visible);
template <typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_diff_visible);
template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
__host__ __device__ constexpr void move_dynamic_tensor_coordinate(
const TensorDesc& tensor_desc, TensorCoord& coord, const TensorCoordStep& coord_step);
template <typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool
coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc& tensor_desc,
const TensorCoord& coord);
template <typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc& tensor_desc,
const TensorCoord& coord);
// Transforms: Tuple<transforms...>
// LowerDimensionIdss : Tuple<Sequence<...>, ...>
// UpperDimensionIdss : Tuple<Sequence<...>, ...>
...@@ -74,10 +57,9 @@ struct DynamicTensorDescriptor
constexpr static index_t ndim_visible_ = GetNumOfVisibleDimension();
constexpr static index_t ndim_hidden_ = GetNumOfHiddenDimension();
using VisibleIndex = MultiIndex<ndim_visible_>;
using HiddenIndex = MultiIndex<ndim_hidden_>;
using Coordinate = DynamicTensorCoordinate<ndim_hidden_, VisibleDimensionIds>;
using CoordinateStep = DynamicTensorCoordinateStep<ntransform_, ndim_visible_>;
public:
__host__ __device__ explicit constexpr DynamicTensorDescriptor(const Transforms& transforms,
...@@ -211,7 +193,11 @@ struct DynamicTensorCoordinate
HiddenIndex idx_hidden_;
};
#if 0 // hack
template <index_t NTransform, index_t NDimVisible>
#else
template <index_t NTransform, index_t NDimVisible, typename HackCalculateLowerIndexDiff>
#endif
struct DynamicTensorCoordinateStep
{
// TODO make these private
...@@ -234,6 +220,11 @@ struct DynamicTensorCoordinateStep
const VisibleIndex idx_diff_visible_;
const MultiIndex<NTransform> do_transforms_;
#if 1 // hack
// HACK: control CalculateLowerIndexDiff for DynamicMerge using this hack
static constexpr HackCalculateLowerIndexDiff hack_calculate_lower_index_diff_;
#endif
};
// TODO: How to fix this? It uses a struct instead of lambda because lambda
...@@ -406,7 +397,72 @@ make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_d
set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low);
});
#if 0 // hack
return DynamicTensorCoordinateStep<ntransform, ndim_visible>{idx_diff_visible, do_transforms};
#else
return DynamicTensorCoordinateStep<ntransform,
ndim_visible,
typename uniform_sequence_gen<ntransform, 0>::type>{
idx_diff_visible, do_transforms};
#endif
}
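// With the hack in place, the plain make_dynamic_tensor_coordinate_step above passes a Sequence
// of all zeros, so every transform takes the fall-through branch of CalculateLowerIndexDiff_hack
// that performs both the carry and the borrow check.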
#if 0 // hack
template <typename TensorDesc, typename VisibleIndex>
#else
// HACK: control CalculateLowerIndexDiff for DynamicMerge using this hack
template <typename TensorDesc, typename VisibleIndex, typename HackCalculateLowerIndexDiff>
#endif
__host__ __device__ constexpr auto make_dynamic_tensor_coordinate_step_hack(
const TensorDesc&, const VisibleIndex& idx_diff_visible, HackCalculateLowerIndexDiff)
{
static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(),
"wrong! # of dimension inconsistent");
constexpr index_t ntransform = TensorDesc::GetNumOfTransform();
constexpr index_t ndim_hidden = TensorDesc::GetNumOfHiddenDimension();
constexpr index_t ndim_visible = TensorDesc::GetNumOfVisibleDimension();
constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds();
static_assert(HackCalculateLowerIndexDiff::Size() == ntransform, "wrong!");
// use index_t for boolean type
auto do_transforms = make_zero_multi_index<ntransform>();
auto is_non_zero_diff = make_zero_multi_index<ndim_hidden>();
// decide do_transform by checking for non-zero index diff components
MultiIndex<VisibleIndex::Size()> non_zero_diff_pick_visible;
static_for<0, ndim_visible, 1>{}(
[&](auto i) { non_zero_diff_pick_visible(i) = (idx_diff_visible[i] != 0); });
set_container_subset(is_non_zero_diff, visible_dim_ids, non_zero_diff_pick_visible);
static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran);
const auto non_zero_diff_pick_up = get_container_subset(is_non_zero_diff, dims_up);
MultiIndex<dims_low.Size()> non_zero_diff_pick_low;
// if any of the upper index diff components is non-zero, then
// 1) this transform needs to be done
// 2) all components of the lower index diff are assumed to be non-zero and need to be
// computed
const bool idx_diff_up_has_non_zero = container_reduce(
non_zero_diff_pick_up, [](auto a, auto b) constexpr { return a or b; }, false);
do_transforms(itran) = idx_diff_up_has_non_zero;
static_for<0, dims_low.Size(), 1>{}(
[&](auto i) { non_zero_diff_pick_low(i) = idx_diff_up_has_non_zero; });
set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low);
});
return DynamicTensorCoordinateStep<ntransform, ndim_visible, HackCalculateLowerIndexDiff>{
idx_diff_visible, do_transforms};
}
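// Usage sketch (illustrative; a descriptor with five transforms is assumed): the caller supplies
// one hack value per transform, e.g. carry-only (1) for a DynamicMerge that is only ever stepped
// forward and 0 for everything else. The per-transform values are later read back in
// move_dynamic_tensor_coordinate via Number<Hack>{}:
//
// const auto step = make_dynamic_tensor_coordinate_step_hack(
// desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 1>{});
// move_dynamic_tensor_coordinate(desc, coord, step);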
template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
...@@ -453,7 +509,17 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
MultiIndex<dims_low.Size()> idx_diff_low;
// calculate idx_diff_low
#if 0 // hack
tran.CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low, idx_up);
#else
// HACK: control CalculateLowerIndexDiff for DynamicMerge using this hack
// TODO remove hack
constexpr index_t Hack =
decltype(coord_step.hack_calculate_lower_index_diff_)::At(itran);
tran.CalculateLowerIndexDiff_hack(
idx_diff_low, idx_diff_up, idx_low, idx_up, Number<Hack>{});
#endif
// update idx_low
idx_low += idx_diff_low;
...
...@@ -149,5 +149,146 @@ struct BlockwiseDynamicTensorSliceTransfer_v4
ThreadwiseTransfer threadwise_transfer_;
};
// this version does the following things to avoid the scratch memory issue
// 1. Use StaticallyIndexedArray instead of a C array for the thread buffer
// 2. ThreadwiseDynamicTensorSliceTransfer_v3 does not keep a reference to the tensor descriptor
// 3. ThreadwiseDynamicTensorSliceTransfer_v3::Run() does not construct a new tensor coordinate
template <index_t BlockSize,
InMemoryDataOperation DstInMemOp,
typename BlockSliceLengths,
typename ThreadSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorDim,
index_t DstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector,
index_t ThreadTransferSrcResetCoordinateAfterRun,
index_t ThreadTransferDstResetCoordinateAfterRun>
struct BlockwiseDynamicTensorSliceTransfer_v4_hack
{
static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseDynamicTensorSliceTransfer_v4_hack(
const SrcDesc& src_desc,
const Index& src_block_slice_origin,
const DstDesc& dst_desc,
const Index& dst_block_slice_origin)
: threadwise_transfer_(
src_desc, make_zero_multi_index<nDim>(), dst_desc, make_zero_multi_index<nDim>())
{
static_assert(nDim == remove_reference_t<remove_cv_t<SrcDesc>>::GetNumOfDimension() &&
nDim == remove_reference_t<remove_cv_t<DstDesc>>::GetNumOfDimension() &&
nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(
is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
"wrong! BlockSize too small");
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
const auto thread_cluster_id =
thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());
const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
threadwise_transfer_.SetSrcSliceOrigin(src_desc,
src_block_slice_origin + thread_data_id_begin);
threadwise_transfer_.SetDstSliceOrigin(dst_desc,
dst_block_slice_origin + thread_data_id_begin);
}
}
__device__ static constexpr auto CalculateThreadDataBegin()
{
const auto thread_cluster_id =
thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());
return thread_cluster_id * ThreadSliceLengths{};
}
__device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunRead(src_desc, p_src);
}
}
__device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunWrite(dst_desc, p_dst);
}
}
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
}
}
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveDstSliceWindow(dst_desc, step);
}
}
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
using ThreadwiseTransfer =
ThreadwiseDynamicTensorSliceTransfer_v3_hack<ThreadSliceLengths,
DstInMemOp,
SrcData,
DstData,
SrcDesc,
DstDesc,
SrcDimAccessOrder,
DstDimAccessOrder,
SrcVectorDim,
DstVectorDim,
SrcScalarPerVector,
DstScalarPerVector,
SrcScalarStrideInVector,
DstScalarStrideInVector,
SrcAddressSpace,
DstAddressSpace,
ThreadTransferSrcResetCoordinateAfterRun,
ThreadTransferDstResetCoordinateAfterRun>;
ThreadwiseTransfer threadwise_transfer_;
};
} // namespace ck
#endif
...@@ -166,28 +166,28 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
// B matrix blockwise copy
auto b_blockwise_copy =
-BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+BlockwiseDynamicTensorSliceTransfer_v4_hack<BlockSize,
InMemoryDataOperation::Set,
Sequence<KPerBlock, NPerBlock>,
BBlockTransferThreadSliceLengths_K_N,
BBlockTransferThreadClusterLengths_K_N,
BBlockTransferThreadClusterArrangeOrder,
Float,
Float,
decltype(b_k_n_global_desc),
decltype(b_k_n_block_desc),
BBlockTransferSrcAccessOrder,
Sequence<0, 1>,
BBlockTransferSrcVectorDim,
1,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N,
AddressSpace::Global,
AddressSpace::Lds,
1,
1,
BThreadTransferSrcResetCoordinateAfterRun,
true>(
b_k_n_global_desc,
make_multi_index(0, n_block_data_on_global),
b_k_n_block_desc,
...
...@@ -646,5 +646,348 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
DstCoord dst_slice_origin_;
};
// this version does the following things to avoid "alloca" in LLVM IR, which would cause
// scratch memory usage and sometimes useless instructions
// 1. It does not keep a reference to the tensor descriptor
// 2. It does not construct a new tensor coordinate for this->Run()
// 3. It does not use a pointer for the VGPR thread buffer
// 4. It calculates the offset for the thread buffer directly, instead of moving the coordinate
template <typename SliceLengths,
InMemoryDataOperation DstInMemOp,
typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorDim,
index_t DstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
// RunRead(), will be fused with MoveSrcSliceWindow to
// save addr computation
bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
// RunWrite(), will be fused with MoveDstSliceWindow to
// save addr computation
struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3_hack(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin)
: src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
{
static_assert(SrcAddressSpace == AddressSpace::Global or
SrcAddressSpace == AddressSpace::Lds,
"wrong!");
static_assert(DstAddressSpace == AddressSpace::Global or
DstAddressSpace == AddressSpace::Lds,
"wrong!");
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3_hack()
: ThreadwiseDynamicTensorSliceTransfer_v3_hack(
SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
{
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_slice_origin_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_slice_origin_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
__device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
{
static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
"wrong! hardcoded for 2D tensor");
// hardcoded for 2D
// TODO implement N-D
if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
{
#if 0 // hack
// TODO use constexpr for coordinate-step to make sure the compiler behaves correctly
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
const auto src_step_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
const auto src_step_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
#elif 0
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, 1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, -1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(-1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
#elif 1
// for padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, 1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, -1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(-1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 1
// for non-padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 2>{});
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 2, 0>{});
#endif
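// Note: each hack Sequence above must have one element per transform of src_desc (enforced by
// the static_assert in make_dynamic_tensor_coordinate_step_hack); the 15-element and 7-element
// variants correspond to the padded and non-padded input tensor descriptors mentioned in the
// comments above.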
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
static_for<0, Len0, 1>{}([&](auto iter0) {
static_for<0, Len1, 1>{}([&](auto iter1) {
// step direction
constexpr bool forward_dim1 = (iter0.value % 2 == 0);
constexpr index_t i0 = iter0.value;
constexpr index_t i1 = forward_dim1 ? iter1.value : Len1 - iter1.value - 1;
// do work
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(make_multi_index(i0, i1));
// hardcoding for buffer_load
// TODO refactor transfer_data() to encapsulate this
static_assert(SrcAddressSpace == AddressSpace::Global,
"wrong! hardcoded to use buffer_load, src must be global mem");
buffer_(Number<buffer_offset>{}) = amd_buffer_load<SrcData, 1>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize());
// move dim1 iterator
if constexpr(iter1.value < Len1 - 1)
{
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_m1);
}
}
});
// move dim0 iterator
if constexpr(iter0.value < Len0 - 1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
}
});
}
// move src coordinate back to its slice origin
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_back_step =
make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateBackStep());
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
}
}
__device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
{
static_assert(remove_reference_t<DstDesc>::GetNumOfDimension() == 2,
"wrong! hardcoded for 2D tensor");
// hardcoded for 2D
// TODO implement N-D
if constexpr(remove_reference_t<DstDesc>::GetNumOfDimension() == 2)
{
// TODO use constexpr for coordinate-step to make sure the compiler behaves correctly
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
static_for<0, Len0, 1>{}([&](auto iter0) {
static_for<0, Len1, 1>{}([&](auto iter1) {
// step direction
constexpr bool forward_dim1 = (iter0.value % 2 == 0);
constexpr index_t i0 = iter0;
constexpr index_t i1 = forward_dim1 ? iter1.value : Len1 - iter1.value - 1;
// do work
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(make_multi_index(i0, i1));
// hardcoding for ds_write
// TODO refactor transfer_data() to encapsulate this
static_assert(DstAddressSpace == AddressSpace::Lds &&
DstInMemOp == InMemoryDataOperation::Set,
"wrong! hardcoded for ds_write");
p_dst[dst_slice_origin_.GetOffset()] = buffer_[Number<buffer_offset>{}];
// move dim1 iterator
if constexpr(iter1.value < Len1 - 1)
{
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1);
}
}
});
// move dim0 iterator
if constexpr(iter0.value < Len0 - 1)
{
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
}
});
}
// move dst coordinate back to its slice origin
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_back_step =
make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
}
}
__device__ static constexpr auto GetCoordinateBackStep()
{
MultiIndex<nDim> back_step;
back_step(Number<0>{}) = 1 - SliceLengths{}[0];
static_for<1, nDim, 1>{}([&](auto i) {
back_step(i) = (SliceLengths{}[i - Number<1>{}] % 2 == 0) ? 0 : (1 - SliceLengths{}[i]);
});
return back_step;
}
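// Worked example (illustrative): for SliceLengths = {2, 3} the snake-order traversal in
// RunRead()/RunWrite() visits (0,0) (0,1) (0,2) (1,2) (1,1) (1,0) and ends at (1, 0), so the
// back step is {1 - 2, 0} = {-1, 0}; for SliceLengths = {3, 4} the traversal ends at (2, 3)
// and the back step is {-2, -3}.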
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx = SrcResetCoordinateAfterRun
? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetCoordinateBackStep();
// is it OK to construct a new step every time?
#if 0 // hack
const auto adjusted_step = make_dynamic_tensor_coordinate_step(
src_desc, adjusted_step_idx);
#elif 1
// for padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
#elif 1
// for non-padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
#endif
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
}
// dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// if dst coord was not reset by RunWrite(), then need to adjust the step here
const auto adjusted_step_idx = DstResetCoordinateAfterRun
? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetCoordinateBackStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_dynamic_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, adjusted_step);
}
private:
static constexpr auto buffer_desc_ =
make_dynamic_naive_tensor_descriptor_packed<nDim>(to_multi_index(SliceLengths{}));
static constexpr index_t buffer_size_ = buffer_desc_.GetElementSpaceSize();
StaticallyIndexedArray<SrcData, buffer_size_> buffer_;
SrcCoord src_slice_origin_;
DstCoord dst_slice_origin_;
};
} // namespace ck
#endif
...@@ -87,7 +87,40 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
-#elif 1
+#elif 0
// cdata = 64, BlockSize = 256, 128x128x2
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 2;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 8;
constexpr index_t GemmNLevel1Cluster = 8;
constexpr index_t ThreadGemmDataPerReadM = 4;
constexpr index_t ThreadGemmDataPerReadN = 4;
using GemmABlockCopyThreadSliceLengths_GemmK_GemmM = Sequence<1, 1>;
using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
constexpr index_t GemmABlockCopySrcDataPerRead_GemmK = 1;
constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 1;
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<1, 1>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<2, 128>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 0
// cdata = 64, BlockSize = 256, 128x128x4
constexpr index_t BlockSize = 256;
...
...@@ -53,7 +53,37 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
-#if 1
+#if 0
// cdata = 64, BlockSize = 256, 128x128x2
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 2;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 8;
constexpr index_t GemmNLevel1Cluster = 8;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<1, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<1, 1>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<2, 128>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 0
// cdata = 64, BlockSize = 256, 128x128x4
constexpr index_t BlockSize = 256;
...@@ -138,7 +168,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto conv_driver =
-#if 0 // debug
+#if 1 // debug
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
#else
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
...
...@@ -22,7 +22,7 @@ int main(int argc, char* argv[])
{
using namespace ck;
-#if 1
+#if 0
// 3x3, 35x35, stride 2
constexpr index_t N = 128;
constexpr index_t C = 192;
...