Commit 01e94729 authored by Chao Liu

fix control flow issue for padding case

parent 68ea43b1
@@ -188,7 +188,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
0,
GemmABlockTransferSrcScalarPerVector_GemmK,
GemmABlockTransferDstScalarPerVector_GemmM,
- true, // move back src coordinate after threadwise copy
+ false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
Sequence<0, 1>,
@@ -623,5 +623,277 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
}
};
template <index_t BlockSize,
typename Float,
typename AccFloat,
index_t GemmMPerBlock,
index_t GemmNPerBlock,
index_t GemmKPerBlock,
index_t GemmMPerThread,
index_t GemmNPerThread,
index_t GemmKPerThread,
index_t GemmMLevel0Cluster,
index_t GemmNLevel0Cluster,
index_t GemmMLevel1Cluster,
index_t GemmNLevel1Cluster,
typename GemmABlockTransferThreadSliceLengths_GemmK_GemmM,
typename GemmABlockTransferThreadClusterLengths_GemmK_GemmM,
index_t GemmABlockTransferSrcScalarPerVector_GemmK,
index_t GemmABlockTransferDstScalarPerVector_GemmM,
typename GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
typename GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
index_t GemmBBlockTransferSrcScalarPerVector_GemmN,
index_t GemmBBlockTransferDstScalarPerVector_GemmN,
index_t GemmCThreadTransferDstScalarPerVector_GemmN1>
struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
{
template <typename... Wei, typename... In, typename... Out>
__host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads,
const Float* __restrict__ p_wei_global,
const Float* __restrict__ p_in_global,
Float* __restrict__ p_out_global) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(I1);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(I2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(I3);
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
if(!(Y == 1 && X == 1 && ConvStrideH == 1 && ConvStrideW == 1 && ConvDilationH == 1 &&
ConvDilationW == 1 && InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 &&
InRightPadW == 0))
{
throw std::runtime_error("wrong! 1x1, stride 1, no padding");
}
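// Note: with Y == X == 1, unit strides/dilations and zero padding, Ho == Hi and
// Wo == Wi, so the input can be read directly as a GemmK x GemmN matrix
// (GemmK = C, GemmN = N * Ho * Wo) without any pad/embed transforms.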
// weight tensor
const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed<2>(make_multi_index(K, C)),
make_tuple(DynamicPassThrough{K}, DynamicPassThrough{C}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}));
// input tensor
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{C}, DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
make_tuple(Sequence<1>{}, Sequence<0, 2, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// output tensor
const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed<3>(make_multi_index(N, K, Ho * Wo)),
make_tuple(DynamicPassThrough{K}, DynamicMerge<2>{make_multi_index(N, Ho * Wo)}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const index_t GemmM = out_gemmm_gemmn_global_desc.GetLength(I0);
const index_t GemmN = out_gemmm_gemmn_global_desc.GetLength(I1);
const index_t GemmK = wei_gemmk_gemmm_global_desc.GetLength(I0);
if(!(GemmM % GemmMPerBlock == 0 && GemmN % GemmNPerBlock == 0 &&
GemmK % GemmKPerBlock == 0))
{
throw std::runtime_error("wrong! GEMM size no divisible");
}
constexpr index_t GemmM1 = GemmMPerThread * GemmMLevel0Cluster * GemmMLevel1Cluster;
constexpr index_t GemmN1 = GemmNPerThread * GemmNLevel0Cluster * GemmNLevel1Cluster;
const index_t GemmM0 = GemmM / GemmM1;
const index_t GemmN0 = GemmN / GemmN1;
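// Illustration (values from the 128x128x16 configuration used below):
// GemmMPerThread = 4, GemmMLevel0Cluster = 4, GemmMLevel1Cluster = 4
// => GemmM1 = 4 * 4 * 4 = 64 and GemmM0 = GemmM / 64; likewise for the N side.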
const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
transform_dynamic_tensor_descriptor(
out_gemmm_gemmn_global_desc,
make_tuple(DynamicUnMerge<2>{make_multi_index(GemmM0, GemmM1)},
DynamicUnMerge<2>{make_multi_index(GemmN0, GemmN1)}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
// GEMM
using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v1<
BlockSize,
Float,
AccFloat,
InMemoryDataOperation::Set,
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerThread,
GemmNPerThread,
GemmKPerThread,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmABlockTransferThreadSliceLengths_GemmK_GemmM,
GemmABlockTransferThreadClusterLengths_GemmK_GemmM,
Sequence<1, 0>,
Sequence<1, 0>,
0,
GemmABlockTransferSrcScalarPerVector_GemmK,
GemmABlockTransferDstScalarPerVector_GemmM,
false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
Sequence<0, 1>,
Sequence<0, 1>,
1,
GemmBBlockTransferSrcScalarPerVector_GemmN,
GemmBBlockTransferDstScalarPerVector_GemmN,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<2, 3, 0, 1>,
3,
GemmCThreadTransferDstScalarPerVector_GemmN1>;
const index_t GridSize = (GemmM / GemmMPerBlock) * (GemmN / GemmNPerBlock);
const bool has_main_k_block_loop = (GemmK + GemmKPerBlock) / (2 * GemmKPerBlock) > 1;
const bool has_double_tail_k_block_loop = (GemmK / GemmKPerBlock) % 2 == 0;
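// Dispatch illustration, assuming GemmKPerBlock = 8:
//   GemmK = 32 -> has_main_k_block_loop = true,  has_double_tail_k_block_loop = true
//   GemmK = 24 -> has_main_k_block_loop = true,  has_double_tail_k_block_loop = false
//   GemmK = 16 -> has_main_k_block_loop = false, has_double_tail_k_block_loop = true
// The four branches below simply pass these two flags to the kernel as
// compile-time integral_constant template arguments.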
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc),
const Float*,
decltype(in_gemmk_gemmn_global_desc),
const Float*,
decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
Float*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
wei_gemmk_gemmm_global_desc,
p_wei_global,
in_gemmk_gemmn_global_desc,
p_in_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc),
const Float*,
decltype(in_gemmk_gemmn_global_desc),
const Float*,
decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
Float*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
wei_gemmk_gemmm_global_desc,
p_wei_global,
in_gemmk_gemmn_global_desc,
p_in_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc),
const Float*,
decltype(in_gemmk_gemmn_global_desc),
const Float*,
decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
Float*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
wei_gemmk_gemmm_global_desc,
p_wei_global,
in_gemmk_gemmn_global_desc,
p_in_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc),
const Float*,
decltype(in_gemmk_gemmn_global_desc),
const Float*,
decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
Float*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
wei_gemmk_gemmm_global_desc,
p_wei_global,
in_gemmk_gemmn_global_desc,
p_in_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
};
} // namespace ck
#endif
@@ -2,7 +2,11 @@
#define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
template <typename GridwiseOp, typename... Xs>
- __global__ void run_gridwise_operation(Xs... xs)
+ __global__ void
#if 1
__launch_bounds__(256, 2)
#endif
run_gridwise_operation(Xs... xs)
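// Note: __launch_bounds__(256, 2) tells the compiler to assume at most 256
// threads per block; the second argument requests a minimum occupancy (its
// exact meaning differs slightly between CUDA and HIP), which in turn caps
// how many registers the kernel may use.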
{
GridwiseOp{}.Run(xs...);
}
...
@@ -848,13 +848,18 @@ struct DynamicMerge
do_carry = idx_low_tmp >= low_lengths_[i];
#if 0
- // TODO: use exec-mask inline asm
+ // TODO: use exec-mask inline asm, which uses 1 VALU
if(do_carry)
{
idx_diff_low(i) -= low_lengths_[i];
}
- #else
+ #elif 1
// this uses 2 VALU
idx_diff_low(i) = do_carry ? idx_diff_low[i] - low_lengths_[i] : idx_diff_low[i];
#elif 1
// this uses 2 VALU
index_t idx_diff_low_tmp = idx_diff_low[i] - low_lengths_[i];
idx_diff_low(i) = do_carry ? idx_diff_low_tmp : idx_diff_low[i];
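// Note: on GCN the select above typically lowers to a subtract plus a
// v_cndmask, i.e. 2 VALU instructions; predicating the subtract with the exec
// mask, as the TODO above suggests, would need only 1 VALU instruction.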
#endif
idx_low(i) += idx_diff_low[i];
@@ -885,8 +890,11 @@ struct DynamicMerge
{
idx_diff_low(i) += low_lengths_[i];
}
- #else
+ #elif 1
idx_diff_low(i) = do_borrow ? idx_diff_low[i] + low_lengths_[i] : idx_diff_low[i];
#elif 1
index_t idx_diff_low_tmp = idx_diff_low[i] + low_lengths_[i];
idx_diff_low(i) = do_borrow ? idx_diff_low_tmp : idx_diff_low[i];
#endif
idx_low(i) += idx_diff_low[i];
...
@@ -541,7 +541,12 @@ coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc& te
const auto idx_up =
get_container_subset(idx_hidden, TensorDesc::GetUpperDimensionIdss().At(itran));
#if 0 // debug
// Comment: this implementation results in weird control flow in ISA
valid = valid && tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up);
#else
valid &= tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up);
#endif
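// Note: the && form above short-circuits, so IsValidUpperIndexMappedToValidLowerIndex
// is skipped whenever valid is already false, which adds a branch; the &= form
// always evaluates the check and compiles to a plain bitwise AND, avoiding the
// control-flow issue mentioned in the comment above.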
}
});
...
@@ -322,7 +322,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
} while(k_block_data_begin < K - 2 * KPerBlock);
}
#if 1
// LDS double buffer: tail
if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left
{
@@ -356,7 +355,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread);
}
#endif
// output: register to global memory
{
@@ -385,33 +383,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
const index_t n_thread_data_on_global =
n_block_data_on_global + c_thread_mtx_on_block.col;
#if 0
ThreadwiseDynamicTensorSliceTransfer_v1r2<
AccFloat,
Float,
decltype(c_m0_m1_n0_n1_thread_desc),
decltype(c_m0_m1_n0_n1_global_desc),
Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
1,
CThreadTransferDstScalarPerVector,
AddressSpace::Vgpr,
AddressSpace::Global,
CGlobalMemoryDataOperation,
1,
1,
true,
true>(c_m0_m1_n0_n1_thread_desc,
make_multi_index(0, 0, 0, 0),
c_m0_m1_n0_n1_global_desc,
make_multi_index(m_thread_data_on_global / M1,
m_thread_data_on_global % M1,
n_thread_data_on_global / N1,
n_thread_data_on_global % N1))
.Run_hack(
c_m0_m1_n0_n1_thread_desc, p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
#else
ThreadwiseDynamicTensorSliceTransfer_v1r3<
AccFloat,
Float,
@@ -432,7 +403,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
n_thread_data_on_global / N1,
n_thread_data_on_global % N1))
.Run_hack(p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
#endif
}
}
...
@@ -7,405 +7,6 @@
namespace ck {
// this version is less likely to have scratch memory issue, due to:
// 1. It does not keep reference to tensor descriptor
// 2. It does not construct new tensor coordinate for this->Run()
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDstDimAccessOrder,
index_t SrcDstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector,
bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
// Run(), will be fused with MoveSrcSliceWindow to
// save addr computation
bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
// RunWrite(), will be fused with MoveDstSliceWindow to
// save addr computation
struct ThreadwiseDynamicTensorSliceTransfer_v1r2
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r2(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin)
: src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
{
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r2()
: ThreadwiseDynamicTensorSliceTransfer_v1r2(
SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
{
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_slice_origin_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_slice_origin_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
__device__ void
Run_hack(const SrcDesc& src_desc, const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
{
if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
{
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
const auto src_step_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
const auto src_step_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
#pragma unroll
for(index_t iter0 = 0; iter0 < Len0; ++iter0)
{
#pragma unroll
for(index_t iter1 = 0; iter1 < Len1; ++iter1)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize(),
p_dst,
dst_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_),
dst_desc.GetElementSpaceSize());
// move dim1 iterator
if(iter1 < Len1 - 1)
{
bool forward_dim1 = (iter0 % 2 == 0);
if(forward_dim1)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_p1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_m1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1);
}
}
}
// move dim0 iterator
if(iter0 < Len0 - 1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
}
}
}
else if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4)
{
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
#if 0
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
const auto src_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
const auto src_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
const auto src_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto src_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto src_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto src_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
const auto dst_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, 1));
const auto dst_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, -1));
const auto dst_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 1, 0));
const auto dst_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, -1, 0));
const auto dst_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto dst_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto dst_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#else
// hack for output tensor
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
const auto src_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
const auto src_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
const auto src_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto src_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto src_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto src_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
const auto dst_step_0_0_0_p1 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 0, 1), Sequence<0, 0, 1, 0, 0>{});
const auto dst_step_0_0_0_m1 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 0, -1), Sequence<0, 0, 2, 0, 0>{});
const auto dst_step_0_0_p1_0 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 1, 0), Sequence<0, 0, 1, 0, 0>{});
const auto dst_step_0_0_m1_0 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, -1, 0), Sequence<0, 0, 2, 0, 0>{});
const auto dst_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto dst_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto dst_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#endif
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
constexpr index_t Len2 = SliceLengths{}[2];
constexpr index_t Len3 = SliceLengths{}[3];
#pragma unroll
for(index_t iter0 = 0; iter0 < Len0; ++iter0)
{
#pragma unroll
for(index_t iter1 = 0; iter1 < Len1; ++iter1)
{
#pragma unroll
for(index_t iter2 = 0; iter2 < Len2; ++iter2)
{
#pragma unroll
for(index_t iter3 = 0; iter3 < Len3; ++iter3)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize(),
p_dst,
dst_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_),
dst_desc.GetElementSpaceSize());
// move dim1 iterator
if(iter3 < Len3 - 1)
{
bool forward_dim3 = (iter2 % 2 == 0);
if(forward_dim3)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_0_p1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_0_m1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_0_m1);
}
}
}
// move dim1 iterator
if(iter2 < Len2 - 1)
{
bool forward_dim2 = (iter1 % 2 == 0);
if(forward_dim2)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_p1_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_p1_0);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_m1_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_m1_0);
}
}
}
// move dim1 iterator
if(iter1 < Len1 - 1)
{
bool forward_dim1 = (iter0 % 2 == 0);
if(forward_dim1)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_p1_0_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1_0_0);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_m1_0_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1_0_0);
}
}
}
// move dim0 iterator:
if(iter0 < Len0 - 1)
{
// move forward in dim0
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0_0_0);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0_0_0);
}
}
}
// move src and dst coordinate back to their origins
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_back_step =
make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateBackStep());
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
}
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_back_step =
make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
}
}
__device__ static constexpr auto GetCoordinateBackStep()
{
MultiIndex<nDim> back_step;
back_step(Number<0>{}) = 1 - SliceLengths{}[0];
static_for<1, nDim, 1>{}([&](auto i) {
back_step(i) = (SliceLengths{}[i - Number<1>{}] % 2 == 0) ? 0 : (1 - SliceLengths{}[i]);
});
return back_step;
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// is it OK to construct a new step every time?
const auto src_slice_origin_step =
make_dynamic_tensor_coordinate_step(src_desc, src_slice_origin_step_idx);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_slice_origin_step);
}
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// is it OK to construct a new step every time?
const auto dst_slice_origin_step =
make_dynamic_tensor_coordinate_step(dst_desc, dst_slice_origin_step_idx);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_slice_origin_step);
}
private:
SrcCoord src_slice_origin_;
DstCoord dst_slice_origin_;
};
// this version is less likely to have scratch memory issue, due to:
// 1. It does not keep reference to tensor descriptor
// 2. It does not construct new tensor coordinate for this->Run()
@@ -1125,7 +726,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
src_desc,
make_multi_index(-1, 0) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
- #elif 1
+ #elif 0
// for non-padded input tensor
const auto src_step_0_p =
make_dynamic_tensor_coordinate_step_hack(src_desc,
@@ -1146,6 +747,19 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
src_desc,
make_multi_index(-1, 0) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 2, 0>{});
#elif 1
// for 1x1, input tensor
const auto src_step_0_p = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, 1) * src_scalar_per_access, Sequence<0, 0, 1>{});
const auto src_step_0_m = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, -1) * src_scalar_per_access, Sequence<0, 0, 2>{});
const auto src_step_p_0 = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(1, 0) * src_scalar_per_access);
const auto src_step_m_0 = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(-1, 0) * src_scalar_per_access);
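// Note: the shorter hack Sequences here (3 entries vs. 7 for the non-padded and
// 11 for the padded case above) presumably mirror the smaller number of hidden
// dimensions in the 1x1 input descriptor, which only uses PassThrough and Merge
// transforms.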
#endif
constexpr auto I0 = Number<0>{};
@@ -1171,13 +785,13 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
vector_type<SrcData, SrcScalarPerVector> src_vector;
#if 1
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
#if 1
src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
@@ -1186,12 +800,18 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
buffer_(Number<buffer_offset>{}) = src_vector[i];
});
- #elif 0
+ #elif 1
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
p_src, src_slice_origin_.GetOffset(), is_valid, src_desc.GetElementSpaceSize());
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
make_multi_index(i0, i1) + i * src_scalar_step_in_vector);
- buffer_(Number<buffer_offset>{}) = is_valid ? src_vector[i] : SrcData{0};
+ buffer_(Number<buffer_offset>{}) = src_vector[i];
});
#elif 0
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
@@ -1357,10 +977,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
// for padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
- #elif 1
- // for non-paded input tensor
+ #elif 0
+ // for non-padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
#elif 1
// for 1x1, input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 1, 2>{});
#endif
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
...
@@ -87,7 +87,7 @@
// thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
- #define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
+ #define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 1
#endif
// workaround: put all workaround here
...
@@ -187,7 +187,7 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
- #elif 1
+ #elif 0
// cdata = 64, BlockSize = 256, 128x128x8
// b threadwise copy 2x2
constexpr index_t BlockSize = 256;
@@ -221,7 +221,7 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
- #elif 1
+ #elif 0
// cdata = 64, BlockSize = 256, 128x128x8
// vector 4
constexpr index_t BlockSize = 256;
@@ -323,7 +323,7 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 4;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 4;
- #elif 0
+ #elif 1
// cdata = 64, BlockSize = 256, 128x128x16
// GemmBBlockCopySrcDataPerRead_GemmN = 4
// GemmCThreadCopyDstDataPerWrite_GemmN1 = 4
...
@@ -104,7 +104,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<2, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
- constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
+ constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 2;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<2, 1>;
@@ -145,7 +145,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
- #elif 1
+ #elif 0
// cdata = 64, BlockSize = 256, 128x128x8
// b thread copy 2x2
constexpr index_t BlockSize = 256;
@@ -176,6 +176,38 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x16
// GemmBBlockCopySrcDataPerRead_GemmN = 4
// GemmCThreadCopyDstDataPerWrite_GemmN1 = 4
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 16;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<4, 2>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 64>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<8, 32>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 4;
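// Sanity check for this configuration: the A copy tile is (4*4) x (2*64) =
// 16 x 128 = GemmKPerBlock x GemmMPerBlock using 4*64 = 256 threads, and the
// B copy tile is (2*8) x (4*32) = 16 x 128 using 8*32 = 256 threads, both
// matching BlockSize = 256; each thread accumulates (128*128)/256 = 64 output
// values, hence "cdata = 64" in the comment above.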
#endif
const index_t N = out_n_k_ho_wo_desc.GetLength(I0);
@@ -203,8 +235,10 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr auto conv_driver =
#if 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
- #else
+ #elif 0
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
#elif 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
#endif
<BlockSize,
TDevice,
...
@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
constexpr index_t C = 96;
constexpr index_t HI = 35;
constexpr index_t WI = 35;
- constexpr index_t K = 96;
+ constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
@@ -133,7 +133,7 @@ int main(int argc, char* argv[])
constexpr index_t C = 128;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
- constexpr index_t K = 128;
+ constexpr index_t K = 384;
constexpr index_t Y = 7;
constexpr index_t X = 1;
@@ -175,10 +175,10 @@ int main(int argc, char* argv[])
#elif 0
// 3x3, 147x147
constexpr index_t N = 128;
- constexpr index_t C = 32;
+ constexpr index_t C = 128;
constexpr index_t HI = 147;
constexpr index_t WI = 147;
- constexpr index_t K = 64;
+ constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
@@ -457,7 +457,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
- #elif 0
+ #elif 1
// 1x1, 7x7
constexpr index_t N = 128;
constexpr index_t C = 512;
...