init commit of conv+add

5ed51b71 · Jing Zhang · b53926e9 · 5ed51b71 · 5ed51b71 · 5ed51b71
Commit 5ed51b71 authored Jul 26, 2021 by Jing Zhang
11 changed files
--- a/composable_kernel/include/driver/driver_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
+++ b/composable_kernel/include/driver/driver_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
--- a/composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
+++ b/composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
@@ -226,7 +226,7 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
                                  Sequence<0, 0, 0, 0, 0>{}));

        // GEMM
-        using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
+        using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v2<
            BlockSize,
            FloatAB,
            FloatAcc,
@@ -273,11 +273,10 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad

        index_t nrepeat = 100;

-        std::cout << "conv_v5r1__NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
-                  << "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
-                  << "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
-                  << conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
-                  << std::endl;
+        std::cout << "NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi << "-k" << K
+                  << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0] << "v"
+                  << conv_strides[I1] << "l" << conv_dilations[I0] << "j" << conv_dilations[I1]
+                  << "q" << in_left_pads[I0] << "p" << in_right_pads[I0] << std::endl;

        std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;

@@ -990,11 +989,10 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_1x1

        index_t nrepeat = 100;

-        std::cout << "conv_v5r1_NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
-                  << "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
-                  << "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
-                  << conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
-                  << std::endl;
+        std::cout << "NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi << "-k" << K
+                  << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0] << "v"
+                  << conv_strides[I1] << "l" << conv_dilations[I0] << "j" << conv_dilations[I1]
+                  << "q" << in_left_pads[I0] << "p" << in_right_pads[I0] << std::endl;

        std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;


--- a/composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp
@@ -47,7 +47,7 @@ template <index_t BlockSize,
          typename CGlobalIteratorHacks,
          typename AGlobalMoveSliceWindowIteratorHacks,
          typename BGlobalMoveSliceWindowIteratorHacks>
-struct GridwiseStaticGemm_km_kn_mn_v3
+struct GridwiseStaticGemm_km_kn_mn_v2
 {
    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
    {
@@ -237,10 +237,12 @@ struct GridwiseStaticGemm_km_kn_mn_v3
            c_thread_buf;

        // initialize output thread tensor
+#if 0
        ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
            decltype(c_k_n_ho_wo_thread_desc),
            Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
        .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
+#endif

        constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0);


--- a/composable_kernel/include/tensor_operation/gridwise_static_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_static_gemm_v3.hpp
--- a/host/driver_offline/CMakeLists.txt
+++ b/host/driver_offline/CMakeLists.txt
@@ -13,9 +13,15 @@ include_directories(BEFORE

 set(CONV_FWD_DRIVER_OFFLINE_SOURCE conv_fwd_driver_offline.cpp)
 set(CONV_BWD_DRIVER_OFFLINE_SOURCE conv_bwd_driver_offline.cpp)
+set(CONV_ADD_FWD_DRIVER_OFFLINE_SOURCE conv_add_fwd_driver_offline.cpp) # single source file of the conv_add_fwd_driver_offline executable
+set(CONV_ACTIV_FWD_DRIVER_OFFLINE_SOURCE conv_activ_fwd_driver_offline.cpp) # single source file of the conv_activ_fwd_driver_offline executable

 add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
 add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
+add_executable(conv_add_fwd_driver_offline ${CONV_ADD_FWD_DRIVER_OFFLINE_SOURCE}) # new executable target added by this change
+add_executable(conv_activ_fwd_driver_offline ${CONV_ACTIV_FWD_DRIVER_OFFLINE_SOURCE}) # new executable target added by this change

 target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
 target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
+target_link_libraries(conv_add_fwd_driver_offline PRIVATE host_tensor) # links host_tensor, same as the existing fwd/bwd drivers
+target_link_libraries(conv_activ_fwd_driver_offline PRIVATE host_tensor) # links host_tensor, same as the existing fwd/bwd drivers
--- a/host/driver_offline/conv_activ_fwd_driver_offline.cpp
+++ b/host/driver_offline/conv_activ_fwd_driver_offline.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "conv_common.hpp"
+#include "host_conv.hpp"
+#include "device_tensor.hpp"
+#include "device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
+
+#define USE_DYNAMIC_MODE 0 // not referenced anywhere else in the visible part of this driver
+#define USE_CONV_FWD_V5R1_NCHW 1 // non-zero compiles the v5r1 NCHW launch branch inside main()
+
+enum ConvForwardAlgo // algorithm selector; main() obtains it by casting atoi(argv[2])
+{
+    V5R1NCHW // only enumerator, so it has the implicit value 0
+};
+
+// Offline test/benchmark driver for the static v5r1 NCHW forward convolution with activation.
+//
+// Command line (all six arguments are required and parsed with atoi):
+//   argv[1] layout          - cast to ConvTensorLayout; the v5r1 branch accepts only NCHW
+//   argv[2] algo            - cast to ConvForwardAlgo
+//   argv[3] do_verification - non-zero: run the host reference and call check_error
+//   argv[4] init_method     - selects the generators used to fill `in` and `wei`
+//   argv[5] do_log          - only consumed by the logging block that is currently under `#if 0`
+//   argv[6] nrepeat         - forwarded to the device convolution call
+//
+// Problem size, strides, dilations, pads and data types are compile-time constants chosen by
+// the preprocessor blocks below. Prints the usage line and exits with status 1 when fewer than
+// six arguments are given; throws std::runtime_error for an unsupported layout.
+int main(int argc, char* argv[])
+{
+    using namespace ck;
+
+    // Compile-time indices used to unpack the tuple returned by f_make_for_device_nchw().
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto I4 = Number<4>{};
+    constexpr auto I5 = Number<5>{};
+    constexpr auto I6 = Number<6>{};
+
+    // static mode
+    if(argc < 7)
+    {
+        // Six arguments (argv[1]..argv[6]) are read below, so the usage text says "1 to 6".
+        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
+        exit(1);
+    }
+
+    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
+    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(atoi(argv[2]));
+    const bool do_verification    = atoi(argv[3]);
+    const int init_method         = atoi(argv[4]);
+    const bool do_log             = atoi(argv[5]);
+    const int nrepeat             = atoi(argv[6]);
+
+    // Problem size: exactly one of the following blocks is enabled (the first with `1`).
+#if 1
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 540;
+    constexpr index_t Wi = 960;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 480;
+    constexpr index_t Wi = 270;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 240;
+    constexpr index_t Wi = 135;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 540;
+    constexpr index_t Wi = 960;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 480;
+    constexpr index_t Wi = 270;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 8;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 4;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#endif
+
+    const index_t conv_stride_h   = 1;
+    const index_t conv_stride_w   = 1;
+    const index_t conv_dilation_h = 1;
+    const index_t conv_dilation_w = 1;
+    const index_t in_left_pad_h   = 1;
+    const index_t in_left_pad_w   = 1;
+    const index_t in_right_pad_h  = 1;
+    const index_t in_right_pad_w  = 1;
+
+    // Effective filter extent once dilation is applied.
+    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const index_t XEff = (X - 1) * conv_dilation_w + 1;
+
+    // Output spatial size derived from the padded input, effective filter size and stride.
+    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+
+    // Data types: input/weight element, accumulator and output element.
+#if 0
+    using in_data_t  = float;
+    using acc_data_t = float;
+    using out_data_t = float;
+#elif 1
+    using in_data_t      = half_t;
+    using acc_data_t     = float;
+    using out_data_t     = half_t;
+#elif 1
+    // Never selected as written: the preceding `#elif 1` already matches.
+    using in_data_t      = int8_t;
+    using acc_data_t     = int32_t;
+    using out_data_t     = int8_t;
+#endif
+
+    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
+
+    // Host-side tensor lengths, ordered according to the requested layout.
+    switch(layout)
+    {
+    case ConvTensorLayout::NCHW:
+        // NCHW
+        in_lengths_host[0]  = static_cast<std::size_t>(N);
+        in_lengths_host[1]  = static_cast<std::size_t>(C);
+        in_lengths_host[2]  = static_cast<std::size_t>(Hi);
+        in_lengths_host[3]  = static_cast<std::size_t>(Wi);
+        wei_lengths_host[0] = static_cast<std::size_t>(K);
+        wei_lengths_host[1] = static_cast<std::size_t>(C);
+        wei_lengths_host[2] = static_cast<std::size_t>(Y);
+        wei_lengths_host[3] = static_cast<std::size_t>(X);
+        out_lengths_host[0] = static_cast<std::size_t>(N);
+        out_lengths_host[1] = static_cast<std::size_t>(K);
+        out_lengths_host[2] = static_cast<std::size_t>(Ho);
+        out_lengths_host[3] = static_cast<std::size_t>(Wo);
+        break;
+    case ConvTensorLayout::NHWC:
+        // NHWC
+        in_lengths_host[0]  = static_cast<std::size_t>(N);
+        in_lengths_host[1]  = static_cast<std::size_t>(Hi);
+        in_lengths_host[2]  = static_cast<std::size_t>(Wi);
+        in_lengths_host[3]  = static_cast<std::size_t>(C);
+        wei_lengths_host[0] = static_cast<std::size_t>(K);
+        wei_lengths_host[1] = static_cast<std::size_t>(Y);
+        wei_lengths_host[2] = static_cast<std::size_t>(X);
+        wei_lengths_host[3] = static_cast<std::size_t>(C);
+        out_lengths_host[0] = static_cast<std::size_t>(N);
+        out_lengths_host[1] = static_cast<std::size_t>(Ho);
+        out_lengths_host[2] = static_cast<std::size_t>(Wo);
+        out_lengths_host[3] = static_cast<std::size_t>(K);
+        break;
+    default: throw std::runtime_error("wrong! not implemented");
+    }
+
+    // `out_host` receives the host reference result, `out_device` the device result.
+    // (The unused `add_device` tensor was dropped: this driver has no add operand, and the
+    // allocation was a full output-sized buffer that nothing read or wrote.)
+    Tensor<in_data_t> in(in_lengths_host);
+    Tensor<in_data_t> wei(wei_lengths_host);
+    Tensor<out_data_t> out_host(out_lengths_host);
+    Tensor<out_data_t> out_device(out_lengths_host);
+
+    std::cout << "layout: " << layout << std::endl;
+    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
+    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
+    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
+    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
+    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
+    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
+    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
+
+    std::size_t num_thread = std::thread::hardware_concurrency();
+
+    // Fill `in` and `wei` according to init_method (0 leaves both untouched).
+    switch(init_method)
+    {
+    case 0:
+        // no initialization
+        break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        break;
+    case 2:
+        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        break;
+    case 3:
+        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        break;
+    case 4:
+        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        break;
+    case 5:
+        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
+        break;
+    default:
+    {
+        // Braces give `gen_wei` its own scope inside the switch.
+        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
+
+        auto gen_wei = [](auto... is) {
+            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
+        };
+        wei.GenerateTensorValue(gen_wei, num_thread);
+        break;
+    }
+    }
+
+    // Packs the compile-time problem description for the NCHW device entry point.
+    // Tuple order: in lengths, wei lengths, out lengths, strides, dilations, left pads,
+    // right pads.
+    auto f_make_for_device_nchw = [&]() {
+        const auto in_lengths_dev =
+            make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
+        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
+        const auto out_lengths_dev =
+            make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
+        const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
+        const auto conv_dilations_dev =
+            make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
+        const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
+        const auto in_right_pads_dev =
+            make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
+
+        return make_tuple(in_lengths_dev,
+                          wei_lengths_dev,
+                          out_lengths_dev,
+                          conv_strides_dev,
+                          conv_dilations_dev,
+                          in_left_pads_dev,
+                          in_right_pads_dev);
+    };
+
+    // NHWC counterpart of the lambda above. It is not called in this driver: the only launch
+    // branch below rejects every layout except NCHW.
+    auto f_make_for_device_nhwc = [&]() {
+        const auto in_lengths_dev =
+            make_tuple(Number<N>{}, Number<Hi>{}, Number<Wi>{}, Number<C>{});
+        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<Y>{}, Number<X>{}, Number<C>{});
+        const auto out_lengths_dev =
+            make_tuple(Number<N>{}, Number<Ho>{}, Number<Wo>{}, Number<K>{});
+        const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
+        const auto conv_dilations_dev =
+            make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
+        const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
+        const auto in_right_pads_dev =
+            make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
+
+        return make_tuple(in_lengths_dev,
+                          wei_lengths_dev,
+                          out_lengths_dev,
+                          conv_strides_dev,
+                          conv_dilations_dev,
+                          in_left_pads_dev,
+                          in_right_pads_dev);
+    };
+
+    // Passed both to the device kernel (template argument) and to the host reference.
+    // NOTE(review): the meaning of the value 2 is defined outside this file - confirm.
+    constexpr ck::index_t activ_type = 2;
+
+#if USE_CONV_FWD_V5R1_NCHW
+    if(algo == ConvForwardAlgo::V5R1NCHW)
+    {
+        if(layout != ConvTensorLayout::NCHW)
+        {
+            throw std::runtime_error("wrong! layout");
+        }
+
+        const auto tmp = f_make_for_device_nchw();
+
+        // NOTE(review): the two literal 8 template arguments are tuning parameters of the
+        // device function; their meaning is not visible from this file.
+        device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw<in_data_t,
+                                                                            8,
+                                                                            8,
+                                                                            activ_type,
+                                                                            acc_data_t,
+                                                                            out_data_t>(tmp[I0],
+                                                                                        tmp[I1],
+                                                                                        tmp[I2],
+                                                                                        tmp[I3],
+                                                                                        tmp[I4],
+                                                                                        tmp[I5],
+                                                                                        tmp[I6],
+                                                                                        in,
+                                                                                        wei,
+                                                                                        out_device,
+                                                                                        nrepeat);
+    }
+#endif
+
+    if(do_verification)
+    {
+        // Host reference with the same activation type, then comparison against the device.
+        host_direct_convolution_activ(in,
+                                      wei,
+                                      out_host,
+                                      make_tuple(conv_stride_h, conv_stride_w),
+                                      make_tuple(conv_dilation_h, conv_dilation_w),
+                                      make_tuple(in_left_pad_h, in_left_pad_w),
+                                      make_tuple(in_right_pad_h, in_right_pad_w),
+                                      activ_type,
+                                      layout);
+
+        check_error(out_host, out_device);
+
+#if 0
+        if(do_log)
+        {
+            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
+        }
+#endif
+    }
+}
--- a/host/driver_offline/conv_add_fwd_driver_offline.cpp
+++ b/host/driver_offline/conv_add_fwd_driver_offline.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "conv_common.hpp"
+#include "host_conv.hpp"
+#include "device_tensor.hpp"
+#include "device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
+
+#define USE_DYNAMIC_MODE 0 // not referenced anywhere else in the visible part of this driver
+#define USE_CONV_FWD_V5R1_NCHW 1 // non-zero compiles the v5r1 NCHW launch branch inside main()
+
+enum ConvForwardAlgo // algorithm selector; main() obtains it by casting atoi(argv[2])
+{
+    V5R1NCHW // only enumerator, so it has the implicit value 0
+};
+
+// Offline test/benchmark driver for the static v5r1 NCHW forward convolution with an extra
+// `add` operand.
+//
+// Command line (all six arguments are required and parsed with atoi):
+//   argv[1] layout          - cast to ConvTensorLayout; the v5r1 branch accepts only NCHW
+//   argv[2] algo            - cast to ConvForwardAlgo
+//   argv[3] do_verification - non-zero: run the host reference and call check_error
+//   argv[4] init_method     - selects the generators used to fill `in`, `wei` and `add`
+//   argv[5] do_log          - only consumed by the logging block that is currently under `#if 0`
+//   argv[6] nrepeat         - forwarded to the device convolution call
+//
+// Problem size, strides, dilations, pads and data types are compile-time constants chosen by
+// the preprocessor blocks below. Prints the usage line and exits with status 1 when fewer than
+// six arguments are given; throws std::runtime_error for an unsupported layout.
+int main(int argc, char* argv[])
+{
+    using namespace ck;
+
+    // Compile-time indices used to unpack the tuple returned by f_make_for_device_nchw().
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto I4 = Number<4>{};
+    constexpr auto I5 = Number<5>{};
+    constexpr auto I6 = Number<6>{};
+    constexpr auto I7 = Number<7>{};
+
+    // static mode
+    if(argc < 7)
+    {
+        // Six arguments (argv[1]..argv[6]) are read below, so the usage text says "1 to 6".
+        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
+        exit(1);
+    }
+
+    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
+    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(atoi(argv[2]));
+    const bool do_verification    = atoi(argv[3]);
+    const int init_method         = atoi(argv[4]);
+    const bool do_log             = atoi(argv[5]);
+    const int nrepeat             = atoi(argv[6]);
+
+    // Problem size: exactly one of the following blocks is enabled (the first with `1`).
+#if 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 540;
+    constexpr index_t Wi = 960;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 480;
+    constexpr index_t Wi = 270;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 1
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 240;
+    constexpr index_t Wi = 135;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 540;
+    constexpr index_t Wi = 960;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 480;
+    constexpr index_t Wi = 270;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 8;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 4;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#endif
+
+    const index_t conv_stride_h   = 1;
+    const index_t conv_stride_w   = 1;
+    const index_t conv_dilation_h = 1;
+    const index_t conv_dilation_w = 1;
+    const index_t in_left_pad_h   = 1;
+    const index_t in_left_pad_w   = 1;
+    const index_t in_right_pad_h  = 1;
+    const index_t in_right_pad_w  = 1;
+
+    // Effective filter extent once dilation is applied.
+    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const index_t XEff = (X - 1) * conv_dilation_w + 1;
+
+    // Convolution output spatial size.
+    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+
+    // The `add` operand and the final output are twice the convolution output in H and W.
+    const index_t Hox2 = Ho * 2;
+    const index_t Wox2 = Wo * 2;
+
+    // Data types: input/weight/add element, accumulator and output element.
+#if 0
+    using in_data_t  = float;
+    using acc_data_t = float;
+    using out_data_t = float;
+#elif 1
+    using in_data_t      = half_t;
+    using acc_data_t     = float;
+    using out_data_t     = half_t;
+#elif 1
+    // Never selected as written: the preceding `#elif 1` already matches.
+    using in_data_t      = int8_t;
+    using acc_data_t     = int32_t;
+    using out_data_t     = int8_t;
+#endif
+
+    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4),
+        add_lengths_host(4);
+
+    // Host-side tensor lengths, ordered according to the requested layout.
+    switch(layout)
+    {
+    case ConvTensorLayout::NCHW:
+        // NCHW
+        in_lengths_host[0]  = static_cast<std::size_t>(N);
+        in_lengths_host[1]  = static_cast<std::size_t>(C);
+        in_lengths_host[2]  = static_cast<std::size_t>(Hi);
+        in_lengths_host[3]  = static_cast<std::size_t>(Wi);
+        wei_lengths_host[0] = static_cast<std::size_t>(K);
+        wei_lengths_host[1] = static_cast<std::size_t>(C);
+        wei_lengths_host[2] = static_cast<std::size_t>(Y);
+        wei_lengths_host[3] = static_cast<std::size_t>(X);
+        out_lengths_host[0] = static_cast<std::size_t>(N);
+        out_lengths_host[1] = static_cast<std::size_t>(K);
+        out_lengths_host[2] = static_cast<std::size_t>(Ho);
+        out_lengths_host[3] = static_cast<std::size_t>(Wo);
+        add_lengths_host[0] = static_cast<std::size_t>(N);
+        add_lengths_host[1] = static_cast<std::size_t>(K);
+        add_lengths_host[2] = static_cast<std::size_t>(Hox2);
+        add_lengths_host[3] = static_cast<std::size_t>(Wox2);
+        break;
+    case ConvTensorLayout::NHWC:
+        // NHWC
+        in_lengths_host[0]  = static_cast<std::size_t>(N);
+        in_lengths_host[1]  = static_cast<std::size_t>(Hi);
+        in_lengths_host[2]  = static_cast<std::size_t>(Wi);
+        in_lengths_host[3]  = static_cast<std::size_t>(C);
+        wei_lengths_host[0] = static_cast<std::size_t>(K);
+        wei_lengths_host[1] = static_cast<std::size_t>(Y);
+        wei_lengths_host[2] = static_cast<std::size_t>(X);
+        wei_lengths_host[3] = static_cast<std::size_t>(C);
+        out_lengths_host[0] = static_cast<std::size_t>(N);
+        out_lengths_host[1] = static_cast<std::size_t>(Ho);
+        out_lengths_host[2] = static_cast<std::size_t>(Wo);
+        out_lengths_host[3] = static_cast<std::size_t>(K);
+        add_lengths_host[0] = static_cast<std::size_t>(N);
+        add_lengths_host[1] = static_cast<std::size_t>(Hox2);
+        add_lengths_host[2] = static_cast<std::size_t>(Wox2);
+        add_lengths_host[3] = static_cast<std::size_t>(K);
+        break;
+    default: throw std::runtime_error("wrong! not implemented");
+    }
+
+    // `out_host` (host reference) and `out_device` (device result) are allocated with the
+    // lengths of the `add` tensor, not with out_lengths_host.
+    Tensor<in_data_t> in(in_lengths_host);
+    Tensor<in_data_t> wei(wei_lengths_host);
+    Tensor<in_data_t> add(add_lengths_host);
+    Tensor<out_data_t> out_host(add_lengths_host);
+    Tensor<out_data_t> out_device(add_lengths_host);
+
+    std::cout << "layout: " << layout << std::endl;
+    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
+    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
+    ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");
+    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
+    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
+    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
+    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
+    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
+
+    std::size_t num_thread = std::thread::hardware_concurrency();
+
+    // Fill `in` and `wei` according to init_method (0 leaves both untouched).
+    switch(init_method)
+    {
+    case 0:
+        // no initialization
+        break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        break;
+    case 2:
+        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        break;
+    case 3:
+        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        break;
+    case 4:
+        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        break;
+    case 5:
+        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
+        break;
+    default:
+    {
+        // Braces give `gen_wei` its own scope inside the switch.
+        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
+
+        auto gen_wei = [](auto... is) {
+            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
+        };
+        wei.GenerateTensorValue(gen_wei, num_thread);
+        break;
+    }
+    }
+
+    // `add` is an input of both the device kernel and the host reference, but no branch above
+    // ever wrote to it, so the add path was never exercised with real data. Fill it whenever
+    // initialization was requested; init_method 0 still means "leave everything untouched".
+    if(init_method != 0)
+    {
+        add.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+    }
+
+    // Packs the compile-time problem description for the NCHW device entry point.
+    // Tuple order: in lengths, wei lengths, add lengths, out lengths, strides, dilations,
+    // left pads, right pads.
+    auto f_make_for_device_nchw = [&]() {
+        const auto in_lengths_dev =
+            make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
+        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
+        const auto out_lengths_dev =
+            make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
+        const auto add_lengths_dev =
+            make_tuple(Number<N>{}, Number<K>{}, Number<Hox2>{}, Number<Wox2>{});
+        const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
+        const auto conv_dilations_dev =
+            make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
+        const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
+        const auto in_right_pads_dev =
+            make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
+
+        return make_tuple(in_lengths_dev,
+                          wei_lengths_dev,
+                          add_lengths_dev,
+                          out_lengths_dev,
+                          conv_strides_dev,
+                          conv_dilations_dev,
+                          in_left_pads_dev,
+                          in_right_pads_dev);
+    };
+
+    // NHWC counterpart of the lambda above. It is not called in this driver: the only launch
+    // branch below rejects every layout except NCHW.
+    auto f_make_for_device_nhwc = [&]() {
+        const auto in_lengths_dev =
+            make_tuple(Number<N>{}, Number<Hi>{}, Number<Wi>{}, Number<C>{});
+        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<Y>{}, Number<X>{}, Number<C>{});
+        const auto out_lengths_dev =
+            make_tuple(Number<N>{}, Number<Ho>{}, Number<Wo>{}, Number<K>{});
+        const auto add_lengths_dev =
+            make_tuple(Number<N>{}, Number<Hox2>{}, Number<Wox2>{}, Number<K>{});
+        const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
+        const auto conv_dilations_dev =
+            make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
+        const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
+        const auto in_right_pads_dev =
+            make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
+
+        return make_tuple(in_lengths_dev,
+                          wei_lengths_dev,
+                          add_lengths_dev,
+                          out_lengths_dev,
+                          conv_strides_dev,
+                          conv_dilations_dev,
+                          in_left_pads_dev,
+                          in_right_pads_dev);
+    };
+
+    // Passed both to the device kernel (template argument) and to the host reference.
+    // NOTE(review): the meaning of the value 2 is defined outside this file - confirm.
+    constexpr ck::index_t activ_type = 2;
+
+#if USE_CONV_FWD_V5R1_NCHW
+    if(algo == ConvForwardAlgo::V5R1NCHW)
+    {
+        if(layout != ConvTensorLayout::NCHW)
+        {
+            throw std::runtime_error("wrong! layout");
+        }
+
+        const auto tmp = f_make_for_device_nchw();
+
+#if 1
+        // NOTE(review): the two literal 8 template arguments are tuning parameters of the
+        // device function; their meaning is not visible from this file.
+        device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw<in_data_t,
+                                                                                8,
+                                                                                8,
+                                                                                activ_type,
+                                                                                acc_data_t,
+                                                                                out_data_t>(
+            tmp[I0], // in_lengths_dev
+            tmp[I1], // wei_lengths_dev
+            tmp[I2], // add_lengths_dev
+            tmp[I3], // out_lengths_dev
+            tmp[I4], // conv_strides_dev
+            tmp[I5], // conv_dilations_dev
+            tmp[I6], // in_left_pads_dev
+            tmp[I7], // in_right_pads_dev
+            in,
+            wei,
+            add,
+            out_device,
+            nrepeat);
+#endif
+    }
+#endif
+
+    if(do_verification)
+    {
+        // Host reference with the same activation type, then comparison against the device.
+        host_direct_convolution_add(in,
+                                    wei,
+                                    add,
+                                    out_host,
+                                    make_tuple(conv_stride_h, conv_stride_w),
+                                    make_tuple(conv_dilation_h, conv_dilation_w),
+                                    make_tuple(in_left_pad_h, in_left_pad_w),
+                                    make_tuple(in_right_pad_h, in_right_pad_w),
+                                    activ_type,
+                                    layout);
+
+        check_error(out_host, out_device);
+
+#if 0
+        if(do_log)
+        {
+            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
+        }
+#endif
+    }
+}
--- a/host/driver_offline/conv_fwd_driver_offline.cpp
+++ b/host/driver_offline/conv_fwd_driver_offline.cpp
@@ -15,16 +15,15 @@
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.hpp"
-#include "device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"

-#define USE_DYNAMIC_MODE 0
-#define USE_CONV_FWD_V4R4_NCHW 0
+#define USE_DYNAMIC_MODE 1
+#define USE_CONV_FWD_V4R4_NCHW 1
 #define USE_CONV_FWD_V4R4R2_NHWC 0
 #define USE_CONV_FWD_V6R1_NCHW 0
-#define USE_CONV_FWD_V5R1_NCHW 1
+#define USE_CONV_FWD_V5R1_NCHW 0
 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
 #define USE_CONV_FWD_V4R4R4_XDL_NHWC 0

@@ -103,82 +102,16 @@ int main(int argc, char* argv[])
    const bool do_log             = atoi(argv[5]);
    const int nrepeat             = atoi(argv[6]);

-#if 1
-    constexpr index_t N           = 1;
-    constexpr index_t C           = 16;
-    constexpr index_t Hi          = 1080;
-    constexpr index_t Wi          = 1920;
-    constexpr index_t K           = 16;
-    constexpr index_t Y           = 3;
-    constexpr index_t X           = 3;
-#elif 0
-    constexpr index_t N  = 1;
-    constexpr index_t C  = 16;
-    constexpr index_t Hi = 540;
-    constexpr index_t Wi = 960;
-    constexpr index_t K  = 16;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
-#elif 0
-    constexpr index_t N  = 1;
-    constexpr index_t C  = 16;
-    constexpr index_t Hi = 480;
-    constexpr index_t Wi = 270;
-    constexpr index_t K  = 16;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
-#elif 0
-    constexpr index_t N  = 1;
-    constexpr index_t C  = 16;
-    constexpr index_t Hi = 240;
-    constexpr index_t Wi = 135;
-    constexpr index_t K  = 16;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
-#elif 0
-    constexpr index_t N  = 1;
-    constexpr index_t C  = 16;
-    constexpr index_t Hi = 1080;
-    constexpr index_t Wi = 1920;
-    constexpr index_t K  = 16;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-#elif 0
-    constexpr index_t N  = 1;
-    constexpr index_t C  = 16;
-    constexpr index_t Hi = 540;
-    constexpr index_t Wi = 960;
-    constexpr index_t K  = 16;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-#elif 0
-    constexpr index_t N  = 1;
-    constexpr index_t C  = 16;
-    constexpr index_t Hi = 480;
-    constexpr index_t Wi = 270;
-    constexpr index_t K  = 16;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-#elif 0
-    constexpr index_t N  = 1;
-    constexpr index_t C  = 8;
-    constexpr index_t Hi = 1080;
-    constexpr index_t Wi = 1920;
-    constexpr index_t K  = 16;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
-#elif 0
-    constexpr index_t N  = 1;
-    constexpr index_t C  = 16;
-    constexpr index_t Hi = 1080;
-    constexpr index_t Wi = 1920;
-    constexpr index_t K  = 4;
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 192;
+    constexpr index_t Hi = 71;
+    constexpr index_t Wi = 71;
+    constexpr index_t K  = 256;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;
-#endif

-    const index_t conv_stride_h   = 1;
-    const index_t conv_stride_w   = 1;
+    const index_t conv_stride_h   = 2;
+    const index_t conv_stride_w   = 2;
    const index_t conv_dilation_h = 1;
    const index_t conv_dilation_w = 1;
    const index_t in_left_pad_h   = 1;
@@ -193,7 +126,7 @@ int main(int argc, char* argv[])
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
 #endif

-#if 0
+#if 1
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;
@@ -437,8 +370,6 @@ int main(int argc, char* argv[])
    }
 #endif

-    constexpr ck::index_t activ_type = 2;
-
 #if USE_CONV_FWD_V5R1_NCHW
    if(algo == ConvForwardAlgo::V5R1NCHW)
    {
@@ -449,12 +380,10 @@ int main(int argc, char* argv[])

        const auto tmp = f_make_for_device_nchw();

-#if 1
-        device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
-#else
-        device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
-#endif
-            <in_data_t, 8, 8, activ_type, acc_data_t, out_data_t>(tmp[I0],
+        device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw<in_data_t,
+                                                                             16,
+                                                                             acc_data_t,
+                                                                             out_data_t>(tmp[I0],
                                                                                         tmp[I1],
                                                                                         tmp[I2],
                                                                                         tmp[I3],
@@ -524,14 +453,13 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
-        host_direct_convolution_activ(in,
+        host_direct_convolution(in,
                                wei,
                                out_host,
                                make_tuple(conv_stride_h, conv_stride_w),
                                make_tuple(conv_dilation_h, conv_dilation_w),
                                make_tuple(in_left_pad_h, in_left_pad_w),
                                make_tuple(in_right_pad_h, in_right_pad_w),
-                                      activ_type,
                                layout);

        check_error(out_host, out_device);

--- a/host/driver_offline/include/device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+#include <unistd.h>
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "driver_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp"
+
+// Host-side launcher for the static v5r1 "convolution + add" forward driver.
+//
+// What this function does, in order:
+//   1. Repacks the input and weight tensors so that the channel dimension C is split into
+//      (C0, C1) with C1 = InWeiVectorSize, and repacks the add tensor so that K is split into
+//      (K0, K1) with K1 = OutVectorSize.
+//   2. Allocates device buffers and uploads the packed input, weight and add tensors.
+//   3. Builds the tensor descriptors and tuning parameters and calls
+//      DriverStaticConvolutionAddForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad::Run().
+//   4. Downloads the packed output and unpacks it into out_n_k_hox2_wox2 (N, K, 2*Ho, 2*Wo).
+//
+// Template parameters:
+//   TInWei          - element type of the input and weight tensors
+//   InWeiVectorSize - number of C elements packed together (C1)
+//   OutVectorSize   - number of K elements packed together (K1)
+//   activ_type      - forwarded to the driver as Number<activ_type>
+//   TAcc / TOut     - accumulator and output element types
+//
+// Parameters:
+//   *_lengths, conv_strides, conv_dilations, in_left_pads, in_right_pads
+//                      - problem description, forwarded to the driver
+//   in_n_c_hi_wi       - input tensor, indexed (n, c, hi, wi)
+//   wei_k_c_y_x        - weight tensor, indexed (k, c, y, x)
+//   add_n_k_hox2_wox2  - tensor added to the result, indexed (n, k, ho*2, wo*2)
+//   out_n_k_hox2_wox2  - result tensor, indexed (n, k, ho*2, wo*2); overwritten
+//   nrepeat            - currently not used by this launcher
+template <typename TInWei,
+          ck::index_t InWeiVectorSize,
+          ck::index_t OutVectorSize,
+          ck::index_t activ_type,
+          typename TAcc,
+          typename TOut,
+          typename InLengths,
+          typename WeiLengths,
+          typename AddLengths,
+          typename OutLengths,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
+    const InLengths& in_n_c_hi_wi_lengths,
+    const WeiLengths& wei_k_c_y_x_lengths,
+    const AddLengths& add_n_k_hox2_wox2_lengths,
+    const OutLengths& out_n_k_ho_wo_lengths,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    const Tensor<TInWei>& in_n_c_hi_wi,
+    const Tensor<TInWei>& wei_k_c_y_x,
+    const Tensor<TOut>& add_n_k_hox2_wox2,
+    Tensor<TOut>& out_n_k_hox2_wox2,
+    ck::index_t nrepeat)
+{
+    using namespace ck;
+
+    // These two arguments are accepted for interface symmetry but are not consumed here.
+    static_cast<void>(add_n_k_hox2_wox2_lengths);
+    static_cast<void>(nrepeat);
+
+    std::cout << __func__ << std::endl;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    const auto N = out_n_k_ho_wo_lengths[I0];
+    const auto K = out_n_k_ho_wo_lengths[I1];
+    const auto C = wei_k_c_y_x_lengths[I1];
+
+    const auto Hi = in_n_c_hi_wi_lengths[I2];
+    const auto Wi = in_n_c_hi_wi_lengths[I3];
+
+    const auto Ho = out_n_k_ho_wo_lengths[I2];
+    const auto Wo = out_n_k_ho_wo_lengths[I3];
+
+    // The add tensor and the final output are twice the convolution output in H and W.
+    const auto Hox2 = Ho * 2;
+    const auto Wox2 = Wo * 2;
+
+    const auto Y = wei_k_c_y_x_lengths[I2];
+    const auto X = wei_k_c_y_x_lengths[I3];
+
+    // C is split as C = C0 * C1, K is split as K = K0 * K1.
+    const auto C0 = C / Number<InWeiVectorSize>{};
+    const auto C1 = Number<InWeiVectorSize>{};
+
+    const auto K0 = K / Number<OutVectorSize>{};
+    const auto K1 = Number<OutVectorSize>{};
+
+    Tensor<TInWei> in_n_c0_hi_wi_c1(
+        HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
+    Tensor<TInWei> wei_k_c0_y_x_c1(
+        HostTensorDescriptor(std::initializer_list<index_t>{K, C0, Y, X, C1}));
+    Tensor<TOut> out_n_k0_hox2_wox2_k1(
+        HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Hox2, Wox2, K1}));
+    Tensor<TOut> add_n_k0_hox2_wox2_k1(
+        HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Hox2, Wox2, K1}));
+
+    // (n, c, hi, wi) -> (n, c0, hi, wi, c1)
+    auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
+        in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) =
+            in_n_c_hi_wi(n, c, hi, wi);
+    };
+
+    // (k, c, y, x) -> (k, c0, y, x, c1)
+    auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) {
+        wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) =
+            wei_k_c_y_x(k, c, y, x);
+    };
+
+    // (n, k, hox2, wox2) -> (n, k0, hox2, wox2, k1).
+    // The packed add tensor is shaped with K0/K1, so it must be split by OutVectorSize.
+    auto f_nchx2wx2_to_nc0hx2wx2c1 = [&](auto n, auto ho, auto wo, auto k) {
+        add_n_k0_hox2_wox2_k1(n, k / OutVectorSize, ho, wo, k % OutVectorSize) =
+            add_n_k_hox2_wox2(n, k, ho, wo);
+    };
+
+    make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
+    make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
+    make_ParallelTensorFunctor(f_nchx2wx2_to_nc0hx2wx2c1, N, Hox2, Wox2, K)();
+
+    DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
+                                          in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
+    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
+    DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
+                                               add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());
+    DeviceMem out_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
+                                               out_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());
+
+    in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
+    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
+    // The add operand is an input of the kernel as well; without this upload the device
+    // buffer handed to Run() would hold uninitialised data.
+    add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());
+
+    const auto in_n_c0_hi_wi_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C0, Hi, Wi));
+    const auto wei_k_c0_y_x_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C0, Y, X));
+    const auto out_n_k0_ho_wo_k1_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1));
+    const auto add_n_k0_hox2_wox2_k1_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Hox2, Wox2, K1));
+
+    // cdata = 64, BlockSize = 64, 16x8x32x4
+    constexpr index_t BlockSize = 64;
+
+    constexpr index_t KPerBlock  = K;
+    constexpr index_t HoPerBlock = 8;
+    constexpr index_t WoPerBlock = 32;
+    constexpr index_t EPerBlock  = C0;
+
+    constexpr index_t KPerThread  = KPerBlock;
+    constexpr index_t HoPerThread = 2;
+    constexpr index_t WoPerThread = 2;
+    constexpr index_t EPerThread  = EPerBlock;
+
+    using ABlockTransferThreadSliceLengths_E_K   = Sequence<Y * X, 1>;
+    using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, KPerBlock>;
+
+    constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
+    constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
+
+    constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
+
+    constexpr index_t CThreadTransferDstScalarPerVector_W = K1;
+
+    static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
+
+    constexpr auto conv_driver =
+        DriverStaticConvolutionAddForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad<
+            BlockSize,
+            typename vector_type<TInWei, InWeiVectorSize>::type,
+            TAcc,
+            TOut,
+            KPerBlock,
+            HoPerBlock,
+            WoPerBlock,
+            EPerBlock,
+            KPerThread,
+            HoPerThread,
+            WoPerThread,
+            EPerThread,
+            ABlockTransferThreadSliceLengths_E_K,
+            ABlockTransferThreadClusterLengths_E_K,
+            ABlockTransferSrcScalarPerVector_E,
+            ABlockTransferDstScalarPerVector_K,
+            BThreadTransferSrcScalarPerVector_W,
+            CThreadTransferDstScalarPerVector_W>{};
+
+    // Weights are passed before the input, matching the order of the descriptors.
+    conv_driver.Run(wei_k_c0_y_x_desc,
+                    in_n_c0_hi_wi_desc,
+                    add_n_k0_hox2_wox2_k1_desc,
+                    out_n_k0_ho_wo_k1_desc,
+                    conv_strides,
+                    conv_dilations,
+                    in_left_pads,
+                    in_right_pads,
+                    Number<activ_type>{},
+                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
+                        wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
+                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
+                        in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
+                    static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
+                    static_cast<TOut*>(out_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()));
+
+    out_n_k0_hox2_wox2_k1_device_buf.FromDevice(out_n_k0_hox2_wox2_k1.mData.data());
+
+    // (n, k0, hox2, wox2, k1) -> (n, k, hox2, wox2); K is split by OutVectorSize.
+    auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) {
+        out_n_k_hox2_wox2(n, k, ho, wo) =
+            out_n_k0_hox2_wox2_k1(n, k / OutVectorSize, ho, wo, k % OutVectorSize);
+    };
+
+    make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Hox2, Wox2)();
+}
--- a/host/host_tensor/include/host_conv.hpp
+++ b/host/host_tensor/include/host_conv.hpp
@@ -188,6 +188,115 @@ void host_direct_convolution_activ(const Tensor<TIn>& in,
    }
 }

+// Host reference for "convolution + activation + add" with a 2x larger output.
+//
+// For every convolution output position (ho, wo) the direct convolution sum is accumulated in
+// double precision, passed through activ(v, activ_type), and the result is then written to the
+// four output positions (2*ho + {0,1}, 2*wo + {0,1}), each one added to the corresponding
+// element of `add`. `out` and `add` therefore have spatial extents of twice the convolution
+// output.
+//
+// Parameters:
+//   in, wei        - input and weight tensors
+//   add            - tensor added element-wise to the result; same indexing as `out`
+//   out            - result tensor; overwritten
+//   conv_strides, conv_dilations, in_left_pads, in_right_pads
+//                  - convolution parameters, indexed with I0 (H) and I1 (W);
+//                    in_right_pads is not read by this function
+//   activ_type     - selector forwarded to activ()
+//   layout         - ConvTensorLayout::NCHW (in/out: N,C,H,W; wei: K,C,Y,X) or
+//                    ConvTensorLayout::NHWC (in/out: N,H,W,C; wei: K,Y,X,C)
+//
+// Throws std::runtime_error for any other layout.
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void host_direct_convolution_add(const Tensor<TIn>& in,
+                                 const Tensor<TWei>& wei,
+                                 const Tensor<TOut>& add,
+                                 Tensor<TOut>& out,
+                                 const ConvStrides& conv_strides,
+                                 const ConvDilations& conv_dilations,
+                                 const InLeftPads& in_left_pads,
+                                 const InRightPads& in_right_pads,
+                                 const ck::index_t activ_type,
+                                 const ConvTensorLayout layout = ConvTensorLayout::NCHW)
+{
+    using namespace ck;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    // NCHW: in(n, c, hi, wi), wei(k, c, y, x), out/add(n, k, h, w)
+    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+        double v = 0;
+        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    // Taps that fall into the padding region contribute zero.
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[3])
+                    {
+                        v += static_cast<const double>(in(n, c, hi, wi)) *
+                             static_cast<const double>(wei(k, c, y, x));
+                    }
+                }
+            }
+        }
+
+        index_t hox2 = ho * 2;
+        index_t wox2 = wo * 2;
+
+        v = activ(v, activ_type);
+
+        // Replicate the activated value over the 2x2 output patch, adding `add` per element.
+        out(n, k, hox2, wox2)         = v + add(n, k, hox2, wox2);
+        out(n, k, hox2, wox2 + 1)     = v + add(n, k, hox2, wox2 + 1);
+        out(n, k, hox2 + 1, wox2)     = v + add(n, k, hox2 + 1, wox2);
+        out(n, k, hox2 + 1, wox2 + 1) = v + add(n, k, hox2 + 1, wox2 + 1);
+    };
+
+    // NHWC: in(n, hi, wi, c), wei(k, y, x, c), out/add(n, h, w, k)
+    auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
+        double v = 0;
+        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    // Taps that fall into the padding region contribute zero.
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[2])
+                    {
+                        v += static_cast<const double>(in(n, hi, wi, c)) *
+                             static_cast<const double>(wei(k, y, x, c));
+                    }
+                }
+            }
+        }
+
+        index_t hox2 = ho * 2;
+        index_t wox2 = wo * 2;
+
+        v = activ(v, activ_type);
+
+        // Output and add tensors are NHWC here as well, so K is the last index.
+        out(n, hox2, wox2, k)         = v + add(n, hox2, wox2, k);
+        out(n, hox2, wox2 + 1, k)     = v + add(n, hox2, wox2 + 1, k);
+        out(n, hox2 + 1, wox2, k)     = v + add(n, hox2 + 1, wox2, k);
+        out(n, hox2 + 1, wox2 + 1, k) = v + add(n, hox2 + 1, wox2 + 1, k);
+    };
+
+    // The functors iterate over convolution output positions, i.e. half of the spatial
+    // extents of `out`; the remaining extents are taken from `out` unchanged.
+    switch(layout)
+    {
+    case ConvTensorLayout::NCHW:
+        make_ParallelTensorFunctor(f_nchw,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2] / 2,
+                                   out.mDesc.GetLengths()[3] /
+                                       2)(std::thread::hardware_concurrency());
+        break;
+    case ConvTensorLayout::NHWC:
+        // Spatial extents are at positions 1 and 2 for NHWC; K is at position 3.
+        make_ParallelTensorFunctor(f_nhwc,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1] / 2,
+                                   out.mDesc.GetLengths()[2] / 2,
+                                   out.mDesc.GetLengths()[3])(
+            std::thread::hardware_concurrency());
+        break;
+    default: throw std::runtime_error("wrong! not supported layout");
+    }
+}
+
 template <typename TIn, typename TWei, typename TOut, typename InLeftPads, typename InRightPads>
 void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                                   const Tensor<TWei>& wei_kcyx,

--- a/script/run.sh
+++ b/script/run.sh
@@ -12,7 +12,7 @@
 #export OLC_DEBUG_HIP_DUMP=1
 #export OLC_DEBUG_SAVE_TEMP_DIR=1

- make -j conv_fwd_driver_offline
+ make -j conv_add_fwd_driver_offline
 #make -j conv_bwd_driver_offline
 #make -j conv_fwd_driver_online

@@ -26,7 +26,7 @@ INIT=$4
 LOG=$5
 REPEAT=$6

-./host/driver_offline/conv_fwd_driver_offline  $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT
+./host/driver_offline/conv_add_fwd_driver_offline  $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT

 ################################################ layout  algo  verify  init  log  repeat  N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
 #./host/driver_offline/conv_fwd_driver_offline  $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT  128  128  192 3 3  71   71     2 2       1 1      1 1       1 1