add non-pad

3d66465a · Jing Zhang · bf111ac6 · 3d66465a · 3d66465a · 3d66465a
Commit 3d66465a authored Apr 16, 2021 by Jing Zhang
7 changed files
--- a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
@@ -31,6 +31,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
 {
    template <typename... Wei,
              typename... In,
+              typename... Add,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
@@ -38,6 +39,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
              typename InRightPads>
    __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
                      const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
+                      const DynamicTensorDescriptor<Add...>& add_n_k0_hox2_wox2_k1_global_desc,
                      const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
                      const ConvStrides& conv_strides,
                      const ConvDilations& conv_dilations,
@@ -45,6 +47,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
                      const InRightPads& in_right_pads,
                      const FloatAB* __restrict__ p_wei_global,
                      const FloatAB* __restrict__ p_in_global,
+                      const FloatC* __restrict__ p_d_global,
                      FloatC* __restrict__ p_out_global) const
    {
        constexpr auto I0 = Number<0>{};
@@ -63,6 +66,9 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
        const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
        const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
+        const auto Hox2 = Ho * 2;
+        const auto Wox2 = Wo * 2;
        const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
        const auto K = wei_k_c_y_x_global_desc.GetLength(I0);
@@ -127,6 +133,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+        // add tensor
+        const auto add_k_n_hopx2_wopx2_global_desc = transform_dynamic_tensor_descriptor(
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Hox2, Wox2)),
+            make_tuple(make_pass_through_transform(K0),
+                       make_pass_through_transform(N),
+                       make_pass_through_transform(Hox2),
+                       make_pass_through_transform(Wox2)),
+            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
        const auto E = C * Y * X;
        if(!((K % KPerBlock) == 0 && (Ho % HoPerBlock) == 0 && (Wo % WoPerBlock) == 0 &&
@@ -158,16 +174,15 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
        // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
        // hack for NKHW format
        constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
-            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
+            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}),
-                       make_tuple(Sequence<0, 2, 0, 0, 0>{},
+                       make_tuple(Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}));
-#if 0
        // GEMM
        using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v3<
            BlockSize,
@@ -177,6 +192,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
            InMemoryDataOperation::Set,
            decltype(wei_e_k_global_desc),
            decltype(in_e_n_ho_wo_global_desc),
+            decltype(add_k_n_hopx2_wopx2_global_desc),
            decltype(out_k_n_ho_wo_global_desc),
            KPerBlock,
            HoPerBlock,
@@ -228,13 +244,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
            for(index_t j = 0; j < nrepeat; ++j)
            {
-                if(has_main_k_block_loop && has_double_tail_k_block_loop)
+#if 0
+                //if(has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               decltype(wei_e_k_global_desc),
                                                               const FloatAB*,
                                                               decltype(in_e_n_ho_wo_global_desc),
                                                               const FloatAB*,
+                                                               decltype(add_k_n_hopx2_wopx2_global_desc),
+                                                               const FloatC*,
                                                               decltype(out_k_n_ho_wo_global_desc),
                                                               FloatC*,
                                                               integral_constant<bool, true>,
@@ -249,22 +268,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
                                  p_wei_global,
                                  in_e_n_ho_wo_global_desc,
                                  p_in_global,
+                                  add_k_n_hopx2_wopx2_global_desc,
+                                  p_d_global,
                                  out_k_n_ho_wo_global_desc,
                                  p_out_global,
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, true>{});
                }
-                else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
+#elif 1
+                // else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
                {
-                    const auto kernel = run_gridwise_operation<gridwise_gemm,
+                    const auto kernel =
-                                                               decltype(wei_e_k_global_desc),
+                        run_gridwise_operation<gridwise_gemm,
-                                                               const FloatAB*,
+                                               decltype(wei_e_k_global_desc),
-                                                               decltype(in_e_n_ho_wo_global_desc),
+                                               const FloatAB*,
-                                                               const FloatAB*,
+                                               decltype(in_e_n_ho_wo_global_desc),
-                                                               decltype(out_k_n_ho_wo_global_desc),
+                                               const FloatAB*,
-                                                               FloatC*,
+                                               decltype(add_k_n_hopx2_wopx2_global_desc),
-                                                               integral_constant<bool, true>,
+                                               const FloatC*,
-                                                               integral_constant<bool, false>>;
+                                               decltype(out_k_n_ho_wo_global_desc),
+                                               FloatC*,
+                                               integral_constant<bool, true>,
+                                               integral_constant<bool, false>>;
                    launch_kernel(kernel,
                                  dim3(GridSize),
@@ -275,22 +300,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
                                  p_wei_global,
                                  in_e_n_ho_wo_global_desc,
                                  p_in_global,
+                                  add_k_n_hopx2_wopx2_global_desc,
+                                  p_d_global,
                                  out_k_n_ho_wo_global_desc,
                                  p_out_global,
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, false>{});
                }
-                else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
+#elif 1
+                // else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
                {
-                    const auto kernel = run_gridwise_operation<gridwise_gemm,
+                    const auto kernel =
-                                                               decltype(wei_e_k_global_desc),
+                        run_gridwise_operation<gridwise_gemm,
-                                                               const FloatAB*,
+                                               decltype(wei_e_k_global_desc),
-                                                               decltype(in_e_n_ho_wo_global_desc),
+                                               const FloatAB*,
-                                                               const FloatAB*,
+                                               decltype(in_e_n_ho_wo_global_desc),
-                                                               decltype(out_k_n_ho_wo_global_desc),
+                                               const FloatAB*,
-                                                               FloatC*,
+                                               decltype(add_k_n_hopx2_wopx2_global_desc),
-                                                               integral_constant<bool, false>,
+                                               const FloatC*,
-                                                               integral_constant<bool, true>>;
+                                               decltype(out_k_n_ho_wo_global_desc),
+                                               FloatC*,
+                                               integral_constant<bool, false>,
+                                               integral_constant<bool, true>>;
                    launch_kernel(kernel,
                                  dim3(GridSize),
@@ -301,22 +332,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
                                  p_wei_global,
                                  in_e_n_ho_wo_global_desc,
                                  p_in_global,
+                                  add_k_n_hopx2_wopx2_global_desc,
+                                  p_d_global,
                                  out_k_n_ho_wo_global_desc,
                                  p_out_global,
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, true>{});
                }
-                else
+#elif 1
+                // else
                {
-                    const auto kernel = run_gridwise_operation<gridwise_gemm,
+                    const auto kernel =
-                                                               decltype(wei_e_k_global_desc),
+                        run_gridwise_operation<gridwise_gemm,
-                                                               const FloatAB*,
+                                               decltype(wei_e_k_global_desc),
-                                                               decltype(in_e_n_ho_wo_global_desc),
+                                               const FloatAB*,
-                                                               const FloatAB*,
+                                               decltype(in_e_n_ho_wo_global_desc),
-                                                               decltype(out_k_n_ho_wo_global_desc),
+                                               const FloatAB*,
-                                                               FloatC*,
+                                               decltype(add_k_n_hopx2_wopx2_global_desc),
-                                                               integral_constant<bool, false>,
+                                               const FloatC*,
-                                                               integral_constant<bool, false>>;
+                                               decltype(out_k_n_ho_wo_global_desc),
+                                               FloatC*,
+                                               integral_constant<bool, false>,
+                                               integral_constant<bool, false>>;
                    launch_kernel(kernel,
                                  dim3(GridSize),
@@ -327,11 +364,14 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
                                  p_wei_global,
                                  in_e_n_ho_wo_global_desc,
                                  p_in_global,
+                                  add_k_n_hopx2_wopx2_global_desc,
+                                  p_d_global,
                                  out_k_n_ho_wo_global_desc,
                                  p_out_global,
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, false>{});
                }
+#endif
            }
            timer.End();
@@ -346,7 +386,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
-#endif
    }
 };
 } // namespace ck

--- a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
@@ -192,11 +192,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
        // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
        // hack for NKHW format
        constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
-            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
+            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}),
-                       make_tuple(Sequence<0, 2, 0, 0, 0>{},
+                       make_tuple(Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}));
@@ -262,7 +262,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
            for(index_t j = 0; j < nrepeat; ++j)
            {
-                if(has_main_k_block_loop && has_double_tail_k_block_loop)
+#if 0
+                //if(has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel =
                        run_gridwise_operation<gridwise_gemm,
@@ -293,7 +294,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, true>{});
                }
-                else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
+#elif 1
+                // else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
                {
                    const auto kernel =
                        run_gridwise_operation<gridwise_gemm,
@@ -324,7 +326,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, false>{});
                }
-                else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
+#elif 1
+                // else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel =
                        run_gridwise_operation<gridwise_gemm,
@@ -355,7 +358,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, true>{});
                }
-                else
+#elif 1
+                // else
                {
                    const auto kernel =
                        run_gridwise_operation<gridwise_gemm,
@@ -386,6 +390,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, false>{});
                }
+#endif
            }
            timer.End();

--- a/composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
@@ -134,8 +134,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
        constexpr auto KPerThreadSubC = 4;
-        constexpr auto HoPerThreadSubC = 2;
+        constexpr auto HoPerThreadSubC = HPerThread;
-        constexpr auto WoPerThreadSubC = 2;
+        constexpr auto WoPerThreadSubC = WPerThread;
        static_assert(KPerThread % KPerThreadSubC == 0, "");
        static_assert(HPerThread % HoPerThreadSubC == 0, "");

--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
@@ -175,6 +175,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
        const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock;
        const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock;
+        const index_t k_thread_data_on_global = k_block_data_on_global + k_thread_id * KPerThread;
        const index_t ho_thread_data_on_global =
            ho_block_data_on_global + ho_thread_id * HoPerThread;
        const index_t wo_thread_data_on_global =
@@ -262,11 +263,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
            a_blockwise_copy.RunWrite(a_e_k_desc, p_a_block);
        }
-        __syncthreads();
+        block_sync_lds();
+        //__syncthreads();
-#if 1
        constexpr auto KPerThreadAdd = KPerThread / CThreadTransferDstScalarPerVector;
        constexpr auto HoPerThreadx2 = HoPerThread * 2;
        constexpr auto WoPerThreadx2 = WoPerThread * 2;
@@ -281,25 +281,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
        constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
-        const index_t hox2_block_data_on_global = ho_block_work_id * HoPerBlock * 2;
+        const index_t hox2_thread_data_on_global = ho_thread_data_on_global * 2;
-        const index_t wox2_block_data_on_global = wo_block_work_id * WoPerBlock * 2;
+        const index_t wox2_thread_data_on_global = wo_thread_data_on_global * 2;
+        const index_t k_thread_data_on_global_add =
-        const index_t hox2_thread_data_on_global =
+            k_thread_data_on_global / CThreadTransferDstScalarPerVector;
-            hox2_block_data_on_global + ho_thread_id * HoPerThreadx2;
-        const index_t wox2_thread_data_on_global =
-            wox2_block_data_on_global + wo_thread_id * WoPerThreadx2;
        static_assert(KPerThread % CThreadTransferDstScalarPerVector == 0, "");
        static_assert(CThreadTransferDstScalarPerVector == 16, "");
-        const index_t k_block_data_on_global_add =
-            k_block_work_id * KPerBlock / CThreadTransferDstScalarPerVector;
-        const index_t k_thread_data_on_global_add =
-            k_block_data_on_global_add + k_thread_id * KPerThreadAdd;
        static_assert(vec_len == 256, "");
        vector_type<int8_t, vec_len> d_vec;
+#if 1
        {
            ThreadwiseDynamicTensorSliceTransfer_v2<
@@ -419,104 +412,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
 #endif
        // output: register to global memory
-#if 0
-        {
-            constexpr auto HoPerThreadx2 = HoPerThread * 2;
-            constexpr auto WoPerThreadx2 = WoPerThread * 2;
-            const index_t hox2_block_data_on_global = ho_block_work_id * HoPerBlock * 2;
-            const index_t wox2_block_data_on_global = wo_block_work_id * WoPerBlock * 2;
-            const index_t hox2_thread_data_on_global =
-                hox2_block_data_on_global + ho_thread_id * HoPerThreadx2;
-            const index_t wox2_thread_data_on_global =
-                wox2_block_data_on_global + wo_thread_id * WoPerThreadx2;
-            static_assert(KPerThread % CThreadTransferDstScalarPerVector == 0, "");
-            constexpr auto KPerThreadAdd = KPerThread / CThreadTransferDstScalarPerVector;
-            const index_t k_block_data_on_global_add =
-                k_block_work_id * KPerBlock / CThreadTransferDstScalarPerVector;
-            const index_t k_thread_data_on_global_add =
-                k_block_data_on_global_add + k_thread_id * KPerThreadAdd;
-            constexpr auto d_k_n_hox2_wox2_thread_desc =
-                make_dynamic_naive_tensor_descriptor_packed_v2(
-                    make_tuple(Number<1>{}, Number<1>{}, Number<1>{}, Number<1>{}));
-            constexpr auto vector_len = CThreadTransferDstScalarPerVector;
-            constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
-            vector_type<int8_t, vector_len> d_vec;
-            for(index_t k_i = 0; k_i < KPerThreadAdd; ++k_i)
-            {
-                for(index_t h_i = 0; h_i < HoPerThreadx2; ++h_i)
-                {
-                    for(index_t w_i = 0; w_i < WoPerThreadx2; ++w_i)
-                    {
-                        ThreadwiseDynamicTensorSliceTransfer_v2<
-                            FloatC,
-                            decltype(d_vec),
-                            decltype(d_k_n_hox2_wox2_global_desc),
-                            decltype(d_k_n_hox2_wox2_thread_desc),
-                            Sequence<1, 1, 1, 1>,
-                            CThreadTransferSrcDstAccessOrder,
-                            CThreadTransferSrcDstVectorDim,
-                            // CThreadTransferDstScalarPerVector,
-                            1,
-                            AddressSpace::Global,
-                            AddressSpace::Vgpr,
-                            InMemoryDataOperation::Set,
-                            1,
-                            true>(d_k_n_hox2_wox2_global_desc,
-                                  make_multi_index(k_thread_data_on_global_add + k_i,
-                                                   0,
-                                                   hox2_thread_data_on_global + h_i,
-                                                   wox2_thread_data_on_global + w_i))
-                            .Run2(d_k_n_hox2_wox2_global_desc,
-                                  p_d_global,
-                                  d_k_n_hox2_wox2_thread_desc,
-                                  make_tuple(I0, I0, I0, I0),
-                                  d_vec,
-                                  c_k_n_ho_wo_global_tensor_iterator_hacks);
-                        static_for<0, vector_len, 1>{}([&](auto i) {
-                            d_vec.template AsType<int8_t>()(i) +=
-                                p_c_thread[c_k_n_ho_wo_thread_desc.CalculateOffset(
-                                    make_tuple(k_i * vector_len + i, 0, h_i / 2, w_i / 2))];
-                        });
-                        ThreadwiseDynamicTensorSliceTransfer_v1r3<
-                            decltype(d_vec),
-                            FloatC,
-                            decltype(d_k_n_hox2_wox2_thread_desc),
-                            decltype(d_k_n_hox2_wox2_global_desc),
-                            Sequence<1, 1, 1, 1>,
-                            CThreadTransferSrcDstAccessOrder,
-                            CThreadTransferSrcDstVectorDim,
-                            // CThreadTransferDstScalarPerVector,
-                            1,
-                            AddressSpace::Vgpr,
-                            AddressSpace::Global,
-                            CGlobalMemoryDataOperation,
-                            1,
-                            true>(d_k_n_hox2_wox2_global_desc,
-                                  make_multi_index(k_thread_data_on_global_add + k_i,
-                                                   0,
-                                                   hox2_thread_data_on_global + h_i,
-                                                   wox2_thread_data_on_global + w_i))
-                            .Run2(d_k_n_hox2_wox2_thread_desc,
-                                  make_tuple(I0, I0, I0, I0),
-                                  d_vec,
-                                  d_k_n_hox2_wox2_global_desc,
-                                  p_c_global,
-                                  c_k_n_ho_wo_global_tensor_iterator_hacks);
-                    }
-                }
-            }
-        }
-#else
        {
            static_for<0, KPerThreadAdd, 1>{}([&](auto k_i) {
                static_for<0, HoPerThreadx2, 1>{}([&](auto h_i) {
@@ -569,7 +464,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
                     p_c_global,
                     c_k_n_ho_wo_global_tensor_iterator_hacks);
        }
-#endif
    }
    // pass tensor descriptor by reference

--- a/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
@@ -109,6 +109,29 @@ struct ThreadwiseGemm_km_kn_mn_v3
                                                   p_c[c_offset_2],
                                                   p_c[c_offset_3]);
                }
+                else if constexpr(H == 1 && W == 4)
+                {
+                    constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
+                    constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1));
+                    constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 2));
+                    constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 3));
+                    constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
+                    constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1));
+                    constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 2));
+                    constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 3));
+                    amd_assembly_outer_product_1x4(p_a[a_offset],
+                                                   p_b[b_offset_0],
+                                                   p_b[b_offset_1],
+                                                   p_b[b_offset_2],
+                                                   p_b[b_offset_3],
+                                                   p_c[c_offset_0],
+                                                   p_c[c_offset_1],
+                                                   p_c[c_offset_2],
+                                                   p_c[c_offset_3]);
+                }
                else
                {
                    static_for<0, H, 1>{}([&](auto h) {

--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
@@ -142,8 +142,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
    constexpr index_t EPerBlock  = C0;
    constexpr index_t KPerThread  = KPerBlock;
-    constexpr index_t HoPerThread = 2;
+    constexpr index_t HoPerThread = 4;
-    constexpr index_t WoPerThread = 2;
+    constexpr index_t WoPerThread = 1;
    constexpr index_t EPerThread  = EPerBlock;
    using ABlockTransferThreadSliceLengths_E_K   = Sequence<9, 1>;
@@ -184,7 +184,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
 #endif
    constexpr auto conv_driver =
-#if 0
+#if 1
        DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
 #else
        DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad

--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -95,7 +95,7 @@ int main(int argc, char* argv[])
 #elif 1
    constexpr index_t N  = 1;
    constexpr index_t C  = 16;
-    constexpr index_t HI = 540;
+    constexpr index_t HI = 544;
    constexpr index_t WI = 960;
    constexpr index_t K  = 16;
    constexpr index_t Y  = 3;
@@ -118,6 +118,20 @@ int main(int argc, char* argv[])
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t HI = 135;
+    constexpr index_t WI = 240;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
    using LeftPads  = Sequence<1, 1>;
    using RightPads = Sequence<1, 1>;
 #elif 0