conv-only use v5r1_add

26c42b94 · Jing Zhang · 4eb9a7a4 · 26c42b94 · 26c42b94 · 26c42b94
Commit 26c42b94 authored Oct 15, 2021 by Jing Zhang
7 changed files
--- a/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2_add.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2_add.hpp
@@ -133,11 +133,56 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_dlops_v2_add(const FloatAB* __restrict__ p_a_grid,
+        kernel_gemm_dlops_v2_resize_add(const FloatAB* __restrict__ p_a_grid,
-                                 const FloatAB* __restrict__ p_b_grid,
+                                        const FloatAB* __restrict__ p_b_grid,
-                                 const FloatC* __restrict__ p_bias_grid,
+                                        const FloatC* __restrict__ p_bias_grid,
-                                 FloatC* __restrict__ p_c_grid,
+                                        FloatC* __restrict__ p_d_grid)
-                                 FloatC* __restrict__ p_d_grid)
+{
+    constexpr index_t shared_block_size =
+        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
+    __shared__ FloatAB p_shared_block[shared_block_size];
+    constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{};
+    constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
+        BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{};
+    constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2{};
+    constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx{};
+    constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
+        CBlockIdToBlockClusterAdaptor_K_N_H_W{};
+    GridwiseGemm::Run(p_a_grid,
+                      p_b_grid,
+                      p_bias_grid,
+                      nullptr,
+                      p_d_grid,
+                      p_shared_block,
+                      a_e0_e1_k0_k1_e2_grid_desc,
+                      b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
+                      c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
+                      d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
+                      c_blockid_to_k_n_h_w_block_cluster_adaptor,
+                      integral_constant<bool, HasMainE0BlockLoop>{});
+}
+template <typename GridwiseGemm,
+          typename FloatAB,
+          typename FloatC,
+          typename AGridDesc_E0_E1_K0_K1_E2,
+          typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
+          typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
+          typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
+          typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
+          bool HasMainE0BlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_gemm_dlops_v2_maxpool(const FloatAB* __restrict__ p_a_grid,
+                                     const FloatAB* __restrict__ p_b_grid,
+                                     const FloatC* __restrict__ p_bias_grid,
+                                     FloatC* __restrict__ p_c_grid,
+                                     FloatC* __restrict__ p_d_grid)
 {
    constexpr index_t shared_block_size =
        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
@@ -165,6 +210,49 @@ __global__ void
                      c_blockid_to_k_n_h_w_block_cluster_adaptor,
                      integral_constant<bool, HasMainE0BlockLoop>{});
 }
+template <typename GridwiseGemm,
+          typename FloatAB,
+          typename FloatC,
+          typename AGridDesc_E0_E1_K0_K1_E2,
+          typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
+          typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
+          typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
+          bool HasMainE0BlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_gemm_dlops_v2(const FloatAB* __restrict__ p_a_grid,
+                             const FloatAB* __restrict__ p_b_grid,
+                             const FloatC* __restrict__ p_bias_grid,
+                             FloatC* __restrict__ p_c_grid)
+{
+    constexpr index_t shared_block_size =
+        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
+    __shared__ FloatAB p_shared_block[shared_block_size];
+    constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{};
+    constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
+        BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{};
+    constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2{};
+    constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
+        CBlockIdToBlockClusterAdaptor_K_N_H_W{};
+    GridwiseGemm::Run(p_a_grid,
+                      p_b_grid,
+                      p_bias_grid,
+                      p_c_grid,
+                      nullptr,
+                      p_shared_block,
+                      a_e0_e1_k0_k1_e2_grid_desc,
+                      b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
+                      c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
+                      c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
+                      c_blockid_to_k_n_h_w_block_cluster_adaptor,
+                      integral_constant<bool, HasMainE0BlockLoop>{});
+}
 #endif
 template <index_t BlockSize,
@@ -209,6 +297,8 @@ template <index_t BlockSize,
          typename AGlobalMoveSliceWindowStepHacks,
          typename BGlobalMoveSliceWindowStepHacks,
          index_t activ_type = 0,
+          index_t bias_type  = 0,
+          index_t out_type   = 1,
          index_t add_type   = 0>
 struct GridwiseGemmDlops_km_kn_mn_v3_add
 {
@@ -426,13 +516,13 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
        const auto K1 = Number<KPerBlock>{};
        const auto K0 = K / K1;
-        const auto H2 = HoPerThread * 2;
+        const auto H2 = Number<HoPerThread * 2>{};
        const auto H1 = Number<HoPerBlock / HoPerThread>{};
-        const auto H0 = Hx / (H1 * H2);
+        const auto H0 = Number<Hx / (H1 * H2)>{};
-        const auto W2 = WoPerThread * 2;
+        const auto W2 = Number<WoPerThread * 2>{};
        const auto W1 = Number<WoPerBlock / WoPerThread>{};
-        const auto W0 = Wx / (W1 * W2);
+        const auto W0 = Number<Wx / (W1 * W2)>{};
        const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = transform_tensor_descriptor(
            d_k_n_hx_wx_grid_desc,
@@ -449,11 +539,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
    __host__ __device__ static constexpr auto
    MakeDK0K1NH0H1HxW0W1WxGridDescriptor(const DGridDesc_K_N_Hx_Wx& d_k_n_hx_wx_grid_desc)
    {
-        if constexpr(add_type == 0)
+        if constexpr(add_type == 1)
        {
            return MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd(d_k_n_hx_wx_grid_desc);
        }
-        else if constexpr(add_type == 1)
+        else if constexpr(add_type == 2)
        {
            return MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool(d_k_n_hx_wx_grid_desc);
        }
@@ -544,9 +634,6 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
        constexpr auto HasMainE1BlockLoop       = CalculateHasMainE1BlockLoop();
        constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop();
-        // const auto Ho = b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetLength(I3);
-        // const auto Wo = b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetLength(I4);
        const auto c_k_n_h_w_block_cluster_idx =
            c_blockid_to_k_n_h_w_block_cluster_adaptor.CalculateBottomIndex(
                make_multi_index(get_block_1d_id()));
@@ -909,6 +996,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
        }
        // activ
+        if constexpr(activ_type > 0)
        {
            static_for<0, c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(), 1>{}([&](auto i) {
                if constexpr(activ_type == 1)
@@ -929,7 +1017,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
            });
        }
-        // Bias
+        if constexpr(bias_type == 1)
        {
            constexpr auto bias_k0_k1_thread_desc =
                make_naive_tensor_descriptor_packed(make_tuple(I1, Number<KPerThread>{}));
@@ -982,6 +1070,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
 #if 1
        // Output
+        if constexpr(out_type == 1)
        {
            // hack to control index calculation when iterating over c_k_n_h0_h1_h2_w0_w1_w2_global
            // tensor
@@ -1032,7 +1121,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
 #if 1
        // Resize_Add
-        if constexpr(add_type == 0)
+        if constexpr(add_type == 1)
        {
            constexpr auto HoPerThreadx2 = HoPerThread * 2;
            constexpr auto WoPerThreadx2 = WoPerThread * 2;
@@ -1145,7 +1234,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
                     d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks);
        }
        // MaxPool
-        else if constexpr(add_type == 1)
+        else if constexpr(add_type == 2)
        {
            static_assert(HoPerThread % 2 == 0 && WoPerThread % 2 == 0, "");

--- a/host/driver_offline/include/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+++ b/host/driver_offline/include/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
@@ -27,7 +27,6 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
    const Tensor<TInWei>& in_n_c0_hi_wi_c1,
    const Tensor<TInWei>& wei_k_c0_y_x_c1,
    const Tensor<TOut>& bias_k0_k1,
-    Tensor<TOut>& out_n_k0_ho_wo_k1,
    const Tensor<TOut>& add_n_k0_hox2_wox2_k1,
    Tensor<TOut>& add_n_k0_hox2_wox2_k1_out,
    ck::index_t nrepeat)
@@ -64,8 +63,6 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
                                          in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
    DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
-    DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
-                                           out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
    DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
                                               add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());
@@ -191,7 +188,6 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
                            static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                                in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
-                            static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
                            nrepeat);
@@ -219,10 +215,8 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                        in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
                    static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
-                    static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
                    static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
                    0);
-    out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
    add_n_k0_hox2_wox2_k1_device_buf.FromDevice(add_n_k0_hox2_wox2_k1_out.mData.data());
 }
--- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
@@ -132,10 +132,6 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
    const auto out_n_k0_ho_wo_k1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
-    static_assert(in_n_c0_hi_wi_c1_desc.IsKnownAtCompileTime(), "");
-    static_assert(wei_k_c0_y_x_c1_desc.IsKnownAtCompileTime(), "");
-    static_assert(out_n_k0_ho_wo_k1_desc.IsKnownAtCompileTime(), "");
    constexpr auto conv_driver =
        DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad<
            BlockSize,

--- a/host/driver_offline/include/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+++ b/host/driver_offline/include/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
@@ -49,7 +49,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                       const FloatAB* __restrict__ p_a_grid,
                       const FloatAB* __restrict__ p_b_grid,
                       const FloatC* __restrict__ p_bias_grid,
-                       FloatC* __restrict__ p_c_grid,
                       FloatC* __restrict__ p_d_grid,
                       const int nrepeat) const
    {
@@ -338,7 +337,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
            decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks),
            decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack),
-            activ_type>;
+            activ_type,
+            1, // bias_type
+            0, // out_type
+            1  // add_type
+            >;
        const auto a_e0_e1_k0_k1_e2_grid_desc =
            GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
@@ -374,7 +377,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
        if(has_main_e0_block_loop)
        {
-            const auto kernel = kernel_gemm_dlops_v2_add<
+            const auto kernel = kernel_gemm_dlops_v2_resize_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
@@ -393,7 +396,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                                              p_a_grid,
                                              p_b_grid,
                                              p_bias_grid,
-                                              p_c_grid,
                                              p_d_grid,
                                              a_e0_e1_k0_k1_e2_grid_desc,
                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
@@ -403,7 +405,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
        }
        else
        {
-            const auto kernel = kernel_gemm_dlops_v2_add<
+            const auto kernel = kernel_gemm_dlops_v2_resize_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
@@ -422,7 +424,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                                              p_a_grid,
                                              p_b_grid,
                                              p_bias_grid,
-                                              p_c_grid,
                                              p_d_grid,
                                              a_e0_e1_k0_k1_e2_grid_desc,
                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
@@ -455,7 +456,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
        if(has_main_e0_block_loop)
        {
-            const auto kernel = kernel_gemm_dlops_v2_add<
+            const auto kernel = kernel_gemm_dlops_v2_resize_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
@@ -475,7 +476,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                p_a_grid,
                p_b_grid,
                p_bias_grid,
-                p_c_grid,
                p_d_grid,
                cast_pointer_to_constant_address_space(
                    a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
@@ -490,8 +490,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
        }
        else
        {
+            const auto kernel = kernel_gemm_dlops_v2_resize_add<
-            const auto kernel = kernel_gemm_dlops_v2_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
@@ -511,7 +510,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                p_a_grid,
                p_b_grid,
                p_bias_grid,
-                p_c_grid,
                p_d_grid,
                cast_pointer_to_constant_address_space(
                    a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
@@ -524,7 +522,35 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                cast_pointer_to_constant_address_space(
                    c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
        }
+#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
+        {
+            static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
+            static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
+            static_assert(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.IsKnownAtCompileTime(), "");
+            static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
+            static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
+            const auto kernel = kernel_gemm_dlops_v2_resize_add<
+                GridwiseGemm,
+                FloatAB,
+                FloatC,
+                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
+                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
+                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
+                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
+                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
+                has_main_e0_block_loop>;
+            ave_time = launch_and_time_kernel(kernel,
+                                              nrepeat,
+                                              dim3(grid_size),
+                                              dim3(BlockSize),
+                                              0,
+                                              p_a_grid,
+                                              p_b_grid,
+                                              p_bias_grid,
+                                              p_d_grid);
+        }
 #endif
        return ave_time;
    }

--- a/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+++ b/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
@@ -4,7 +4,7 @@
 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
-#include "gridwise_gemm_dlops_v2.hpp"
+#include "gridwise_gemm_dlops_v2_add.hpp"
 template <ck::index_t BlockSize,
          typename FloatAB,
@@ -259,7 +259,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
        static_assert(c_k_n_hop_wop_grid_desc.IsKnownAtCompileTime(), "");
        // GEMM
-        using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
+        using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3_add<
            BlockSize,
            FloatAB,
            FloatAcc,
@@ -268,6 +268,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
            decltype(a_e0_e1_k_e2_grid_desc),
            decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
            decltype(c_k_n_hop_wop_grid_desc),
+            decltype(c_k_n_hop_wop_grid_desc),
            E1,
            E2,
            K2,
@@ -298,9 +299,14 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
            decltype(a_e0_e1_k_e2_global_step_hacks),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
+            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
            decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack),
-            activ_type>;
+            activ_type,
+            0, // bias_type
+            1, // out_type
+            0  // add_type
+            >;
        const auto a_e0_e1_k0_k1_e2_grid_desc =
            GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
@@ -469,7 +475,33 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                cast_pointer_to_constant_address_space(
                    c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
        }
+#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
+        {
+            static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
+            static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
+            static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
+            static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
+            const auto kernel =
+                kernel_gemm_dlops_v2<GridwiseGemm,
+                                     FloatAB,
+                                     FloatC,
+                                     remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
+                                     remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
+                                     remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
+                                     remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
+                                     has_main_e0_block_loop>;
+            ave_time = launch_and_time_kernel(kernel,
+                                              nrepeat,
+                                              dim3(grid_size),
+                                              dim3(BlockSize),
+                                              0,
+                                              p_a_grid,
+                                              p_b_grid,
+                                              p_bias_grid,
+                                              p_c_grid);
+        }
 #endif
        return ave_time;
    }

--- a/host/driver_offline/include/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+++ b/host/driver_offline/include/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
@@ -323,8 +323,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
            9,
            BThreadTransferSrcScalarPerVector_E2,
-            false, // don't move back src coordinate after threadwise copy, which will be fused with
+            false, // don't move back src coordinate after threadwise copy, which will be fused
-                   // MoveSrcSliceWindow() to save addr computation
+                   // with MoveSrcSliceWindow() to save addr computation
            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, I2, H2, W0, W1, I2, W2
            1,
            CThreadTransferDstScalarPerVector_K,
@@ -335,7 +335,10 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
            decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack),
            activ_type,
-            1>;
+            1, // bias_type
+            1, // out_type
+            2  // add_type
+            >;
        const auto a_e0_e1_k0_k1_e2_grid_desc =
            GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
@@ -528,16 +531,16 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
            static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
            static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
-            const auto kernel =
+            const auto kernel = kernel_gemm_dlops_v2_maxpool<
-                kernel_gemm_dlops_v2_add<GridwiseGemm,
+                GridwiseGemm,
-                                         FloatAB,
+                FloatAB,
-                                         FloatC,
+                FloatC,
-                                         remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
+                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
-                                         remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
+                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
-                                         remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
+                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
-                                         remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
+                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
-                                         remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
+                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
-                                         has_main_e0_block_loop>;
+                has_main_e0_block_loop>;
            ave_time = launch_and_time_kernel(kernel,
                                              nrepeat,

--- a/host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp
+++ b/host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp
@@ -214,7 +214,6 @@ int main(int argc, char* argv[])
    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<in_data_t> add(add_lengths_host);
-    Tensor<out_data_t> out_device(out_lengths_host);
    Tensor<in_data_t> add_device(add_lengths_host);
    Tensor<in_data_t> add_host(add_lengths_host);
    Tensor<out_data_t> bias(bias_lengths_host);
@@ -308,7 +307,6 @@ int main(int argc, char* argv[])
            in,
            wei,
            bias,
-            out_device,
            add,
            add_device,
            nrepeat);