Commit fe427fd1 authored by Jing Zhang

init commit for conv+activ

parent b8bb1480
......@@ -173,8 +173,8 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pad_transform(Ho, I0, OutRightPadH),
make_pad_transform(Wo, I0, OutRightPadW)),
make_right_pad_transform(Ho, OutRightPadH),
make_right_pad_transform(Wo, OutRightPadW)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
......@@ -387,5 +387,714 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
}
}
};
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
index_t KPerBlock,
index_t HoPerBlock,
index_t WoPerBlock,
index_t EPerBlock,
index_t KPerThread,
index_t HoPerThread,
index_t WoPerThread,
index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E_K,
typename ABlockTransferThreadClusterLengths_E_K,
index_t ABlockTransferSrcScalarPerVector_E,
index_t ABlockTransferDstScalarPerVector_K,
index_t BThreadTransferSrcScalarPerVector_W,
index_t CThreadTransferDstScalarPerVector_W>
struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad_1x1
{
template <typename... Wei,
typename... In,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads_,
const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global,
FloatC* __restrict__ p_out_global) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N_ = in_n_c_hi_wi_global_desc.GetLength(I0);
const auto C_ = in_n_c_hi_wi_global_desc.GetLength(I1);
const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2);
const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3);
const auto K0_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Ho_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0);
const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2);
const auto X_ = wei_k_c_y_x_global_desc.GetLength(I3);
constexpr auto N = Number<N_>{};
constexpr auto C = Number<C_>{};
constexpr auto K0 = Number<K0_>{};
constexpr auto K1 = Number<K1_>{};
constexpr auto Hi = Number<Hi_>{};
constexpr auto Wi = Number<Wi_>{};
constexpr auto Ho = Number<Ho_>{};
constexpr auto Wo = Number<Wo_>{};
constexpr auto K = Number<K_>{};
constexpr auto Y = Number<Y_>{};
constexpr auto X = Number<X_>{};
const auto ConvStrideH_ = conv_strides[I0];
const auto ConvStrideW_ = conv_strides[I1];
const auto ConvDilationH_ = conv_dilations[I0];
const auto ConvDilationW_ = conv_dilations[I1];
constexpr auto ConvStrideH = Number<ConvStrideH_>{};
constexpr auto ConvStrideW = Number<ConvStrideW_>{};
constexpr auto ConvDilationH = Number<ConvDilationH_>{};
constexpr auto ConvDilationW = Number<ConvDilationW_>{};
constexpr auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
constexpr auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
constexpr auto OutRightPadH = Hop - Ho;
constexpr auto OutRightPadW = Wop - Wo;
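// e.g. (illustrative sizes, not from this config): Ho = 540, HoPerBlock = 8 gives
// Hop = ceil(540 / 8) * 8 = 544 and OutRightPadH = 4; when Ho is already a multiple
// of HoPerBlock the right pad is zero.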
const auto InLeftPadH_ = in_left_pads[I0];
const auto InLeftPadW_ = in_left_pads[I1];
constexpr auto InLeftPadH = Number<InLeftPadH_>{};
constexpr auto InLeftPadW = Number<InLeftPadW_>{};
static_assert(InLeftPadH == 0 and InLeftPadW == 0, "wrong! this driver assumes zero input left padding");
constexpr auto in_right_pads = InRightPads{};
const auto InRightPadH_ = in_right_pads[I0] + OutRightPadH * ConvStrideH;
const auto InRightPadW_ = in_right_pads[I1] + OutRightPadW * ConvStrideW;
constexpr auto InRightPadH = Number<InRightPadH_>{};
constexpr auto InRightPadW = Number<InRightPadW_>{};
std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW
<< std::endl;
std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
<< std::endl;
// weight tensor
const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)),
make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}));
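// note: the (K, C) weight is re-viewed as an (E, K) = (C, K) GEMM matrix here; the
// swapped output Sequences place C (i.e. E) as the first dimension and K as the second.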
static_assert(wei_e_k_global_desc.IsKnownAtCompileTime(),
"wrong! wei_e_k_global_desc need to known at compile-time");
// input tensor
const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(make_pass_through_transform(C),
make_pass_through_transform(N),
make_right_pad_transform(Hi, InRightPadH),
make_right_pad_transform(Wi, InRightPadW)),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(in_e_n_ho_wo_global_desc.IsKnownAtCompileTime(),
"wrong! in_e_n_ho_wo_global_desc need to known at compile-time");
// output tensor
const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_right_pad_transform(Ho, OutRightPadH),
make_right_pad_transform(Wo, OutRightPadW)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(),
"wrong! out_k_n_hop_wop_global_desc need to known at compile-time");
const auto E = C;
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
(E % EPerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size not divisible");
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr auto a_e_k_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};
constexpr auto b_e_n_ho_wo_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
Sequence<0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
// GEMM
using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperation::Set,
decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_hop_wop_global_desc),
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
Sequence<1, 0>,
Sequence<1, 0>,
0,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 3, 1>,
3,
BThreadTransferSrcScalarPerVector_W,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<0, 2, 3, 1>,
0,
CThreadTransferDstScalarPerVector_W,
decltype(a_e_k_global_iterator_hacks),
decltype(b_e_n_ho_wo_global_iterator_hacks),
decltype(c_k_n_ho_wo_global_tensor_iterator_hacks),
decltype(a_e_k_global_move_slice_window_iterator_hack),
decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>;
const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
constexpr bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
constexpr bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
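// e.g. (illustrative) E = 16, EPerBlock = 2: (16 + 2) / (2 * 2) = 4 > 1 so the main
// K-block loop runs, and 16 / 2 = 8 is even so the double-tail path is taken.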
index_t nrepeat = 100;
std::cout << "conv_v5r1__NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
<< "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
<< "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
<< conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
<< std::endl;
std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
<< " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
<< std::endl;
for(index_t j = 0; j < nrepeat; ++j)
{
if constexpr(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if constexpr(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if constexpr(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k0_ho_wo_k1_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
};
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
index_t KPerBlock,
index_t HoPerBlock,
index_t WoPerBlock,
index_t EPerBlock,
index_t KPerThread,
index_t HoPerThread,
index_t WoPerThread,
index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E_K,
typename ABlockTransferThreadClusterLengths_E_K,
index_t ABlockTransferSrcScalarPerVector_E,
index_t ABlockTransferDstScalarPerVector_K,
index_t BThreadTransferSrcScalarPerVector_W,
index_t CThreadTransferDstScalarPerVector_W>
struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_1x1
{
template <typename... Wei,
typename... In,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads_,
const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global,
FloatC* __restrict__ p_out_global) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N_ = in_n_c_hi_wi_global_desc.GetLength(I0);
const auto C_ = in_n_c_hi_wi_global_desc.GetLength(I1);
const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2);
const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3);
const auto K0_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Ho_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0);
const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2);
const auto X_ = wei_k_c_y_x_global_desc.GetLength(I3);
constexpr auto N = Number<N_>{};
constexpr auto C = Number<C_>{};
constexpr auto K0 = Number<K0_>{};
constexpr auto K1 = Number<K1_>{};
constexpr auto Hi = Number<Hi_>{};
constexpr auto Wi = Number<Wi_>{};
constexpr auto Ho = Number<Ho_>{};
constexpr auto Wo = Number<Wo_>{};
constexpr auto K = Number<K_>{};
constexpr auto Y = Number<Y_>{};
constexpr auto X = Number<X_>{};
const auto ConvStrideH_ = conv_strides[I0];
const auto ConvStrideW_ = conv_strides[I1];
const auto ConvDilationH_ = conv_dilations[I0];
const auto ConvDilationW_ = conv_dilations[I1];
constexpr auto ConvStrideH = Number<ConvStrideH_>{};
constexpr auto ConvStrideW = Number<ConvStrideW_>{};
constexpr auto ConvDilationH = Number<ConvDilationH_>{};
constexpr auto ConvDilationW = Number<ConvDilationW_>{};
constexpr auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
constexpr auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
constexpr auto OutRightPadH = Hop - Ho;
constexpr auto OutRightPadW = Wop - Wo;
const auto InLeftPadH_ = in_left_pads[I0];
const auto InLeftPadW_ = in_left_pads[I1];
constexpr auto InLeftPadH = Number<InLeftPadH_>{};
constexpr auto InLeftPadW = Number<InLeftPadW_>{};
static_assert(InLeftPadH == 0 and InLeftPadW == 0, "wrong! this driver assumes zero input left padding");
constexpr auto in_right_pads = InRightPads{};
const auto InRightPadH_ = in_right_pads[I0] + OutRightPadH * ConvStrideH;
const auto InRightPadW_ = in_right_pads[I1] + OutRightPadW * ConvStrideW;
constexpr auto InRightPadH = Number<InRightPadH_>{};
constexpr auto InRightPadW = Number<InRightPadW_>{};
static_assert(OutRightPadW == 0 and OutRightPadH == 0, "wrong! this driver requires Ho and Wo to be multiples of HoPerBlock and WoPerBlock");
// weight tensor
const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)),
make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}));
static_assert(wei_e_k_global_desc.IsKnownAtCompileTime(),
"wrong! wei_e_k_global_desc need to known at compile-time");
// input tensor
const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(make_pass_through_transform(C),
make_pass_through_transform(N),
make_pass_through_transform(Ho),
make_pass_through_transform(Wo)),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(in_e_n_ho_wo_global_desc.IsKnownAtCompileTime(),
"wrong! in_e_n_ho_wo_global_desc need to known at compile-time");
// output tensor
const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pass_through_transform(Ho),
make_pass_through_transform(Wo)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(),
"wrong! out_k_n_hop_wop_global_desc need to known at compile-time");
const auto E = C;
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
(E % EPerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size not divisible");
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr auto a_e_k_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};
constexpr auto b_e_n_ho_wo_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
Sequence<0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
// GEMM
using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperation::Set,
decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_hop_wop_global_desc),
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
Sequence<1, 0>,
Sequence<1, 0>,
0,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 3, 1>,
3,
BThreadTransferSrcScalarPerVector_W,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<0, 2, 3, 1>,
0,
CThreadTransferDstScalarPerVector_W,
decltype(a_e_k_global_iterator_hacks),
decltype(b_e_n_ho_wo_global_iterator_hacks),
decltype(c_k_n_ho_wo_global_tensor_iterator_hacks),
decltype(a_e_k_global_move_slice_window_iterator_hack),
decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>;
const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
constexpr bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
constexpr bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
index_t nrepeat = 100;
std::cout << "conv_v5r1_NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
<< "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
<< "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
<< conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
<< std::endl;
std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
<< " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
<< std::endl;
for(index_t j = 0; j < nrepeat; ++j)
{
if constexpr(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if constexpr(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if constexpr(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k0_ho_wo_k1_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
};
} // namespace ck
#endif
......@@ -32,7 +32,7 @@ __host__ __device__ constexpr auto make_left_pad_transform(
return DynamicLeftPad<LowLength, LeftPad, SkipIsValidCheck>{low_length, left_pad};
}
template <typename LowLength, typename RightPad, bool SkipIsValidCheck>
template <typename LowLength, typename RightPad, bool SkipIsValidCheck = false>
__host__ __device__ constexpr auto make_right_pad_transform(
const LowLength& low_length,
const RightPad& right_pad,
......
......@@ -346,6 +346,19 @@ struct GridwiseStaticGemm_km_kn_mn_v3
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
}
// activ
{
// hard-coded for now; matches the host-side ActivType_t codes: 1 = relu, 2 = sigmoid
constexpr index_t activ_type = 2;
static_for<0, c_k_n_ho_wo_thread_desc.GetElementSpaceSize(), 1>{}([&](auto i) {
if constexpr(activ_type == 1)
c_thread_buf(i) = c_thread_buf[i] >= 0 ? c_thread_buf[i] : 0.0;
else if constexpr(activ_type == 2)
c_thread_buf(i) = 1.0 / (1.0 + exp(-c_thread_buf[i]));
});
}
// output: register to global memory
{
// hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
......
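For reference, a minimal host-side sketch of what the fused epilogue above computes on each thread's accumulator buffer; `apply_activation` and the plain float array are illustrative stand-ins, not part of the kernel code:

#include <cmath>
#include <cstddef>

// Hedged stand-in for the device epilogue: applies the activation selected by
// activ_type (1 = relu, 2 = sigmoid, anything else = passthrough) element-wise.
inline void apply_activation(float* buf, std::size_t n, int activ_type)
{
    for(std::size_t i = 0; i < n; ++i)
    {
        if(activ_type == 1)
            buf[i] = buf[i] >= 0.f ? buf[i] : 0.f;
        else if(activ_type == 2)
            buf[i] = 1.f / (1.f + std::exp(-buf[i]));
    }
}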
......@@ -103,14 +103,14 @@ int main(int argc, char* argv[])
const bool do_log = atoi(argv[5]);
const int nrepeat = atoi(argv[6]);
#if 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 1080;
constexpr index_t Wi = 1920;
constexpr index_t K = 16;
constexpr index_t Y = 3;
constexpr index_t X = 3;
#if 1
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 1080;
constexpr index_t Wi = 1920;
constexpr index_t K = 16;
constexpr index_t Y = 3;
constexpr index_t X = 3;
#elif 0
constexpr index_t N = 1;
constexpr index_t C = 16;
......@@ -127,7 +127,7 @@ int main(int argc, char* argv[])
constexpr index_t K = 16;
constexpr index_t Y = 3;
constexpr index_t X = 3;
#elif 1
#elif 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 240;
......@@ -135,7 +135,7 @@ int main(int argc, char* argv[])
constexpr index_t K = 16;
constexpr index_t Y = 3;
constexpr index_t X = 3;
#elif 1
#elif 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 1080;
......@@ -143,6 +143,38 @@ int main(int argc, char* argv[])
constexpr index_t K = 16;
constexpr index_t Y = 1;
constexpr index_t X = 1;
#elif 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 540;
constexpr index_t Wi = 960;
constexpr index_t K = 16;
constexpr index_t Y = 1;
constexpr index_t X = 1;
#elif 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 480;
constexpr index_t Wi = 270;
constexpr index_t K = 16;
constexpr index_t Y = 1;
constexpr index_t X = 1;
#elif 0
constexpr index_t N = 1;
constexpr index_t C = 8;
constexpr index_t Hi = 1080;
constexpr index_t Wi = 1920;
constexpr index_t K = 16;
constexpr index_t Y = 3;
constexpr index_t X = 3;
#elif 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 1080;
constexpr index_t Wi = 1920;
constexpr index_t K = 4;
constexpr index_t Y = 3;
constexpr index_t X = 3;
#endif
const index_t conv_stride_h = 1;
......@@ -420,17 +452,17 @@ int main(int argc, char* argv[])
#else
device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
#endif
<in_data_t, 8, acc_data_t, out_data_t>(tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei,
out_device,
nrepeat);
<in_data_t, 8, 8, acc_data_t, out_data_t>(tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei,
out_device,
nrepeat);
}
#endif
......@@ -490,14 +522,15 @@ int main(int argc, char* argv[])
if(do_verification)
{
host_direct_convolution(in,
wei,
out_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
layout);
host_direct_convolution_activ(in,
wei,
out_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
layout,
ActivType_t::sigmoid);
check_error(out_host, out_device);
......
......@@ -6,6 +6,7 @@
template <typename TInWei,
ck::index_t InWeiVectorSize,
ck::index_t OutVectorSize,
typename TAcc,
typename TOut,
typename InLengths,
......@@ -53,8 +54,8 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
const auto C0 = C / Number<InWeiVectorSize>{};
const auto C1 = Number<InWeiVectorSize>{};
const auto K0 = K / Number<InWeiVectorSize>{};
const auto K1 = Number<InWeiVectorSize>{};
const auto K0 = K / Number<OutVectorSize>{};
const auto K1 = Number<OutVectorSize>{};
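// e.g. with K = 16 and OutVectorSize = 8, this gives K0 = 2 and K1 = 8 (illustrative of the active config above).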
Tensor<TInWei> in_n_c0_hi_wi_c1(
HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
......@@ -105,7 +106,7 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>;
using ABlockTransferThreadSliceLengths_E_K = Sequence<Y * X, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, KPerBlock>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
......@@ -120,8 +121,10 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
constexpr auto conv_driver =
#if 0
DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
#else
#elif 1
DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
#elif 1
DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad_1x1
#endif
<BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type,
......
#pragma once
#include "host_tensor.hpp"
typedef enum
{
passthrough = 0,
relu,
sigmoid
} ActivType_t;
template <typename TIn,
typename TWei,
typename TOut,
......@@ -88,6 +95,106 @@ void host_direct_convolution(const Tensor<TIn>& in,
}
}
template <typename T>
inline T activ(T v, const ActivType_t activ_type)
{
switch(activ_type)
{
case passthrough: return v;
case relu: return (v >= 0 ? v : static_cast<T>(0));
case sigmoid: return static_cast<T>(1 / (1 + exp(-v)));
default: throw std::runtime_error("unsupported activ type");
}
}
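// Illustrative usage (not part of this change): the verification path below applies
// activ to the accumulated value before storing it, e.g.
//   double r = activ(0.75, ActivType_t::sigmoid); // 1 / (1 + exp(-0.75)) ~= 0.679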
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_activ(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const ConvTensorLayout layout = ConvTensorLayout::NCHW,
const ActivType_t activ_type = ActivType_t::passthrough)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
double v = 0;
for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
v += static_cast<const double>(in(n, c, hi, wi)) *
static_cast<const double>(wei(k, c, y, x));
}
}
}
}
out(n, k, ho, wo) = activ(v, activ_type);
};
auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
double v = 0;
for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
{
for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
wi < in.mDesc.GetLengths()[2])
{
v += static_cast<const double>(in(n, hi, wi, c)) *
static_cast<const double>(wei(k, y, x, c));
}
}
}
}
out(n, ho, wo, k) = activ(v, activ_type);
};
switch(layout)
{
case ConvTensorLayout::NCHW:
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break;
case ConvTensorLayout::NHWC:
make_ParallelTensorFunctor(f_nhwc,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break;
default: throw std::runtime_error("wrong! not supported layout");
}
}
template <typename TIn, typename TWei, typename TOut, typename InLeftPads, typename InRightPads>
void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx,
......