Commit 506a823a authored by Chao Liu's avatar Chao Liu

clean up

parent 80901f59
@@ -2,7 +2,7 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "tensor.hpp"
-#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_operation_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"
 
 template <typename T,
@@ -770,47 +770,45 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
 
-    constexpr auto gridwise_conv =
-        GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer<
+    using gridwise_conv = GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer<
         GridSize,
         BlockSize,
         T,
         T,
         decltype(in_nchw_desc),
         decltype(wei_kcyx_desc),
         decltype(out_nkhw_desc),
         ConvStrides,
         ConvDilations,
         LeftPads,
         RightPads,
-        ConvolutionDirection::Forward,
         BPerBlock,
         KPerBlock,
         EPerBlock,
         GemmNRepeat,
         GemmMPerThread,
         GemmNPerThread,
         GemmKPerThread,
         GemmMLevel0Cluster,
         GemmNLevel0Cluster,
         GemmMLevel1Cluster,
         GemmNLevel1Cluster,
         GemmDataPerReadA,
         GemmDataPerReadB,
         InBlockCopySubLengths_E_N1_B_N2,
         InBlockCopyClusterLengths_E_N1_B_N2,
         InBlockCopyThreadClusterArrangeOrder,
         InBlockCopySrcAccessOrder,
         InBlockCopyDstAccessOrder,
         InBlockCopySrcDataPerRead_B,
         InBlockCopyDstDataPerWrite_N2,
         WeiBlockCopySubLengths_E_K,
         WeiBlockCopyClusterLengths_E_K,
         WeiBlockCopyThreadClusterArrangeOrder,
         WeiBlockCopySrcAccessOrder,
         WeiBlockCopyDstAccessOrder,
         WeiBlockCopySrcDataPerRead_E,
-        WeiBlockCopyDstDataPerWrite_K>{};
+        WeiBlockCopyDstDataPerWrite_K>;
 
     for(index_t i = 0; i < 5; ++i)
     {
@@ -821,7 +819,10 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
         for(index_t j = 0; j < nrepeat; ++j)
         {
-            launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), TDevice>,
+            launch_kernel(run_gridwise_operation<gridwise_conv,
+                                                 const TDevice* const __restrict__,
+                                                 const TDevice* const __restrict__,
+                                                 TDevice* const __restrict__>,
                           dim3(GridSize),
                           dim3(BlockSize),
                           0,
...
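Note on the change above: the convolution-specific kernel wrapper (run_gridwise_convolution_kernel, instantiated from an object via decltype) is replaced by a generic run_gridwise_operation wrapper that is instantiated with the gridwise operation type and the exact pointer types of its arguments. A minimal sketch of what such a wrapper could look like, assuming it simply default-constructs the gridwise operation and forwards the raw pointers (the actual definition lives in gridwise_operation_wrapper.hpp and may differ):

// hypothetical sketch, not the verbatim contents of gridwise_operation_wrapper.hpp
template <typename GridwiseOp, typename... Xs>
__global__ void run_gridwise_operation(Xs... xs)
{
    // default-construct the gridwise operation and forward all kernel arguments to it
    GridwiseOp{}.Run(xs...);
}

// the launch sites above then resolve to something like:
//   launch_kernel(run_gridwise_operation<gridwise_conv,
//                                        const TDevice* const __restrict__,
//                                        const TDevice* const __restrict__,
//                                        TDevice* const __restrict__>,
//                 dim3(GridSize), dim3(BlockSize), 0, p_in, p_wei, p_out);
// where p_in / p_wei / p_out stand for the device buffer pointers.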
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp"
using namespace ck;
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations>
void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 0
// 1x1 filter, 8x8 image
constexpr index_t N0 = 1;
constexpr index_t Ho0 = 2;
constexpr index_t Wo0 = 1;
constexpr index_t N2 = 4;
constexpr index_t Ho2 = 1;
constexpr index_t Wo2 = 1;
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t EPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2 = Sequence<1, 1, 1, 1, 1, 4, 1, 1>;
using InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2 = Sequence<8, 1, 2, 1, 16, 1, 1, 1>;
using InBlockCopyThreadClusterArrangeOrder =
Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2]
using InBlockCopySrcAccessOrder =
Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2]
using InBlockCopyDstAccessOrder =
Sequence<0, 1, 2, 3, 4, 5, 6, 7>; // [E, N0, Ho0, Wo0, B, N2, Ho2, Wo2]
constexpr index_t InBlockCopyDataPerAccess_W2 = 1;
using WeiBlockCopySubLengths_E_K = Sequence<4, 1>;
using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 1
// 1x1 filter, 8x8 image
constexpr index_t N0 = 1;
constexpr index_t Ho0 = 2;
constexpr index_t Wo0 = 1;
constexpr index_t N2 = 2;
constexpr index_t Ho2 = 2;
constexpr index_t Wo2 = 1;
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t EPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2 = Sequence<1, 1, 2, 1, 1, 2, 1, 1>;
using InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2 = Sequence<8, 1, 1, 1, 16, 1, 2, 1>;
using InBlockCopyThreadClusterArrangeOrder =
Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2]
using InBlockCopySrcAccessOrder =
Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2]
using InBlockCopyDstAccessOrder =
Sequence<0, 1, 2, 3, 4, 5, 6, 7>; // [E, N0, Ho0, Wo0, B, N2, Ho2, Wo2]
constexpr index_t InBlockCopyDataPerAccess_W2 = 1;
using WeiBlockCopySubLengths_E_K = Sequence<4, 1>;
using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#endif
constexpr index_t N1 = N / (N0 * N2);
constexpr index_t Ho1 = Ho / (Ho0 * Ho2);
constexpr index_t Wo1 = Wo / (Wo0 * Wo2);
constexpr index_t B = N1 * Ho1 * Wo1;
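// launch one workgroup per (BPerBlock x KPerBlock) tile of the (B, K) output space,
// rounding up so partial tiles at the edges are still covered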
constexpr index_t GridSize =
((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer<
GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
N1,
N2,
Ho1,
Ho2,
Wo1,
Wo2,
BPerBlock,
KPerBlock,
EPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2,
InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopyDataPerAccess_W2,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
float time =
launch_and_time_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
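// calculate_convolution_flops returns the total FLOP count of the convolution;
// dividing by 1e9 and by the elapsed time in ms yields TFlop/s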
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
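// pause between launches for roughly the kernel duration (time is in ms, usleep takes us),
// capped at 10 ms, so back-to-back runs do not perturb each other's timing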
usleep(std::min(time * 1000, float(10000)));
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp"
using namespace ck;
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations>
void device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1
// 1x1 filter, 8x8 image
constexpr index_t N1 = 2;
constexpr index_t Ho1 = 1;
constexpr index_t Wo1 = 1;
constexpr index_t N2 = 1;
constexpr index_t Ho2 = 1;
constexpr index_t Wo2 = 4;
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t EPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopySubLengths_E_N1_Ho1_Wo1_B_N2_Ho2_Wo2 = Sequence<1, 1, 1, 1, 1, 1, 1, 4>;
using InBlockCopyClusterLengths_E_N1_Ho1_Wo1_B_N2_Ho2_Wo2 = Sequence<8, 2, 1, 1, 16, 1, 1, 1>;
using InBlockCopyThreadClusterArrangeOrder =
Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N1, N2, Ho1, Ho2, Wo1, B, Wo2]
using InBlockCopySrcAccessOrder =
Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N1, N2, Ho1, Ho2, Wo1, B, Wo2]
using InBlockCopyDstAccessOrder =
Sequence<0, 1, 2, 3, 4, 5, 6, 7>; // [E, N1, Ho1, Wo1, B, N2, Ho2, Wo2]
constexpr index_t InBlockCopyDataPerAccess_W2 = 4;
using WeiBlockCopySubLengths_E_K = Sequence<4, 1>;
using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#endif
constexpr index_t N0 = N / (N1 * N2);
constexpr index_t Ho0 = Ho / (Ho1 * Ho2);
constexpr index_t Wo0 = Wo / (Wo1 * Wo2);
constexpr index_t B = N0 * Ho0 * Wo0;
constexpr index_t GridSize =
((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer<
GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
N0,
N1,
N2,
Ho0,
Ho1,
Ho2,
Wo0,
Wo1,
Wo2,
BPerBlock,
KPerBlock,
EPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_E_N1_Ho1_Wo1_B_N2_Ho2_Wo2,
InBlockCopyClusterLengths_E_N1_Ho1_Wo1_B_N2_Ho2_Wo2,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopyDataPerAccess_W2,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
float time =
launch_and_time_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
 #include <unistd.h>
 #include "device.hpp"
 #include "tensor.hpp"
-#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_operation_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 
 template <class T,
@@ -120,7 +120,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
     constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
-#elif 0
+#elif 1
     // cdata = 64, BlockSize = 256, 128x128x8
     constexpr index_t BlockSize = 256;
@@ -793,7 +793,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 2;
     constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
-#elif 1
+#elif 0
     // cdata = 64, BlockSize = 64, 32x128x3
     constexpr index_t BlockSize = 64;
@@ -968,7 +968,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
 
-    constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<
+    using gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<
         GridSize,
         BlockSize,
         TDevice,
@@ -1000,7 +1000,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
         GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
         GemmBBlockCopySrcDataPerRead_GemmN,
         GemmBBlockCopyDstDataPerWrite_GemmN,
-        GemmCThreadCopyDstDataPerWrite_GemmN1>{};
+        GemmCThreadCopyDstDataPerWrite_GemmN1>;
 
     for(index_t i = 0; i < 5; ++i)
     {
@@ -1011,7 +1011,10 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
         for(index_t j = 0; j < nrepeat; ++j)
         {
-            launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), TDevice>,
+            launch_kernel(run_gridwise_operation<gridwise_conv,
+                                                 const TDevice* const __restrict__,
+                                                 const TDevice* const __restrict__,
+                                                 TDevice* const __restrict__>,
                           dim3(GridSize),
                           dim3(BlockSize),
                           0,
...
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_nchw_kcyx_nkhw.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_convolution_implicit_gemm_v4r4_xdlops_fp16_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc =
make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides());
constexpr auto wei_kcyx_desc =
make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
constexpr auto out_nkhw_desc =
make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t K = out_nkhw_desc.GetLength(I1);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
// cdata = 64, BlockSize = 128, GemmMPerBlock x GemmNPerBlock = 128x64, GemmKPerBlock = 4, GemmKPACK = 4
constexpr index_t BlockSize = 128;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 64;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmKPACK = 4;
constexpr index_t GemmMPerWave = 64;
constexpr index_t GemmNPerWave = 64;
constexpr index_t ThreadGemmDataPerReadM = 1;
constexpr index_t ThreadGemmDataPerReadN = 1;
using GemmABlockCopyThreadSliceLengths_GemmK_GemmM_GemmKPACK = Sequence<1, 4, 4>;
using GemmABlockCopyThreadClusterLengths_GemmK_GemmM_GemmKPACK = Sequence<4, 32, 1>;
constexpr index_t GemmABlockCopySrcDataPerRead_GemmKPACK = 1;
constexpr index_t GemmABlockCopyDstDataPerWrite_GemmKPACK = 1;
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN_GemmKPACK = Sequence<1, 2, 4>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN_GemmKPACK = Sequence<4, 32, 1>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmKPACK = 1;
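// implicit-GEMM view of the forward convolution: GemmM = K (output channels) and
// GemmN = N * Ho * Wo (output pixels); each workgroup computes a GemmMPerBlock x GemmNPerBlock tile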
constexpr index_t GemmM = K;
constexpr index_t GemmN = N * Ho * Wo;
constexpr index_t GridSize = math::integer_divide_ceil(GemmM, GemmMPerBlock) *
math::integer_divide_ceil(GemmN, GemmNPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv =
GridwiseConvolutionImplicitGemm_v4r4_xdlops_fwd_fp16_nchw_kcyx_nkhw<
GridSize,
BlockSize,
half,
float,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmKPACK,
GemmMPerWave,
GemmNPerWave,
ThreadGemmDataPerReadM,
ThreadGemmDataPerReadN,
GemmABlockCopyThreadSliceLengths_GemmK_GemmM_GemmKPACK,
GemmABlockCopyThreadClusterLengths_GemmK_GemmM_GemmKPACK,
GemmABlockCopySrcDataPerRead_GemmKPACK,
GemmABlockCopyDstDataPerWrite_GemmKPACK,
GemmBBlockCopyThreadSliceLengths_GemmK_GemmN_GemmKPACK,
GemmBBlockCopyThreadClusterLengths_GemmK_GemmN_GemmKPACK,
GemmBBlockCopySrcDataPerRead_GemmN,
GemmBBlockCopyDstDataPerWrite_GemmKPACK>{};
for(index_t i = 0; i < 10; ++i)
{
float time =
launch_and_time_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
}
// warm up
printf("Warn up running %d times...\n", nrepeat);
for(index_t i = 0; i < nrepeat; ++i)
{
launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
}
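// time nrepeat back-to-back launches between two device-wide synchronizations
// and report the per-launch average below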
printf("Start running %d times...\n", nrepeat);
cudaDeviceSynchronize();
auto start = std::chrono::steady_clock::now();
for(index_t i = 0; i < nrepeat; ++i)
{
launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
}
cudaDeviceSynchronize();
auto end = std::chrono::steady_clock::now();
float ave_time = std::chrono::duration<float, std::milli>(end - start).count() / nrepeat;
printf("Average elapsed time : %f ms, %f TFlop/s\n",
ave_time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / ave_time);
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc =
make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides());
constexpr auto wei_kcyx_desc =
make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
constexpr auto out_nkhw_desc =
make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t K = out_nkhw_desc.GetLength(I1);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
// cdata = 64, BlockSize = 256, 128x128x16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 16;
constexpr index_t GemmMPerWave = 64;
constexpr index_t GemmNPerWave = 64;
constexpr index_t ThreadGemmDataPerReadM = 1;
constexpr index_t ThreadGemmDataPerReadN = 1;
using GemmABlockCopyThreadSliceLengths_GemmK_GemmM = Sequence<4, 2>;
using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<4, 64>;
constexpr index_t GemmABlockCopySrcDataPerRead_GemmK = 4;
constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 1;
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<4, 2>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<4, 64>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmM = K;
constexpr index_t GemmN = N * Ho * Wo;
constexpr index_t GridSize = math::integer_divide_ceil(GemmM, GemmMPerBlock) *
math::integer_divide_ceil(GemmN, GemmNPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv =
GridwiseConvolutionImplicitGemm_v4r4_xdlops_fwd_fp32_nchw_kcyx_nkhw<
GridSize,
BlockSize,
T,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerWave,
GemmNPerWave,
ThreadGemmDataPerReadM,
ThreadGemmDataPerReadN,
GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
GemmABlockCopySrcDataPerRead_GemmK,
GemmABlockCopyDstDataPerWrite_GemmM,
GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
GemmBBlockCopySrcDataPerRead_GemmN,
GemmBBlockCopyDstDataPerWrite_GemmN>{};
for(index_t i = 0; i < 10; ++i)
{
float time =
launch_and_time_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
}
// warm up
printf("Warn up running %d times...\n", nrepeat);
for(index_t i = 0; i < nrepeat; ++i)
{
launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
}
printf("Start running %d times...\n", nrepeat);
cudaDeviceSynchronize();
auto start = std::chrono::steady_clock::now();
for(index_t i = 0; i < nrepeat; ++i)
{
launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
}
cudaDeviceSynchronize();
auto end = std::chrono::steady_clock::now();
float ave_time = std::chrono::duration<float, std::milli>(end - start).count() / nrepeat;
printf("Average elapsed time : %f ms, %f TFlop/s\n",
ave_time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / ave_time);
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
using namespace ck;
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
const Tensor<TInWei>& in_nchw,
WeiDesc,
const Tensor<TInWei>& wei_kcyx,
OutDesc,
Tensor<TOut>& out_nkhw,
index_t nrepeat)
{
// this assumes the in / wei data type is int8x4
constexpr index_t NVector = 4;
using accum_t = int32_t;
using vector_t = vector_type<TInWei, NVector>;
using vector_mem_t = typename vector_t::MemoryType;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// vectorized input
auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{});
ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: ");
Tensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));
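// pack NVector consecutive input channels into one vector element, so the channel
// dimension shrinks from C to C / NVector (the active branch below packs 4 channels)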
auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) {
#if 0
in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w);
#elif 0
in_nchw_vec(n, c, h, w) =
vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
#elif 1
in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w),
in_nchw(n, 4 * c + 1, h, w),
in_nchw(n, 4 * c + 2, h, w),
in_nchw(n, 4 * c + 3, h, w));
#endif
};
make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)(
std::thread::hardware_concurrency());
// vectorize weight
auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{});
ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: ");
Tensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));
auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) {
#if 0
wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x);
#elif 0
wei_kcyx_vec(k, c, y, x) =
vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x));
#elif 1
wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 4 * c, y, x),
wei_kcyx(k, 4 * c + 1, y, x),
wei_kcyx(k, 4 * c + 2, y, x),
wei_kcyx(k, 4 * c + 3, y, x));
#endif
};
make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)(
std::thread::hardware_concurrency());
//
DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace());
DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(sizeof(TOut) * out_nkhw.mDesc.GetElementSpace());
in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data());
wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 0
// 3x3, 34x34, 128 thread, fp32, vector = 1
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 32;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 32;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 4;
constexpr index_t CPerThread = 2;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopyDataPerRead = 2;
constexpr index_t WeiBlockCopyDataPerRead = 2;
constexpr index_t BlockSize = 128;
#elif 0
// 3x3, 34x34, 128 thread, fp32, vector = 2
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 32;
constexpr index_t CPerBlock = 2;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 32;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 4;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopyDataPerRead = 2;
constexpr index_t WeiBlockCopyDataPerRead = 2;
constexpr index_t BlockSize = 128;
#elif 0
// 3x3, 34x34, 128 thread, int8, vector = 4
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 32;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 32;
constexpr index_t NPerThread = 1;
constexpr index_t KPerThread = 8;
constexpr index_t CPerThread = 2;
constexpr index_t HoPerThread = 4;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopyDataPerRead = 2;
constexpr index_t WeiBlockCopyDataPerRead = 2;
constexpr index_t BlockSize = 128;
#elif 1
// 1x1, 32x32, 128 thread, int8, vector = 4
constexpr index_t NPerBlock = 1;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 16;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 32;
constexpr index_t NPerThread = 1;
constexpr index_t KPerThread = 8;
constexpr index_t CPerThread = 2;
constexpr index_t HoPerThread = 4;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopyDataPerRead = 2;
constexpr index_t WeiBlockCopyDataPerRead = 2;
constexpr index_t BlockSize = 128;
#endif
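// each workgroup computes an NPerBlock x KPerBlock x HoPerBlock x WoPerBlock output tile;
// the plain divisions below assume those tile sizes divide N, K, Ho and Wo evenly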
constexpr index_t GridSize =
(N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
float time = launch_and_time_kernel(
gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw<TInWei,
TOut,
accum_t,
decltype(in_nchw_vec_desc),
decltype(wei_kcyx_vec_desc),
decltype(out_nkhw_desc),
NVector,
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead,
BlockSize,
GridSize>,
dim3(GridSize),
dim3(BlockSize),
static_cast<TInWei*>(in_nchw_vec_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(wei_kcyx_vec_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
#pragma once
#include "tensor.hpp"
template <typename T,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void host_col2im(const Tensor<T>& in_eb,
Tensor<T>& in_nchw,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads)
{
using namespace ck;
int N = in_nchw.mDesc.GetLengths()[0];
int C = in_nchw.mDesc.GetLengths()[1];
int HI = in_nchw.mDesc.GetLengths()[2];
int WI = in_nchw.mDesc.GetLengths()[3];
int Y = FilterSizes{}[0];
int X = FilterSizes{}[1];
int HO = OutputSizes{}[0];
int WO = OutputSizes{}[1];
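// col2im: input pixel (n, c, hi, wi) accumulates every im2col entry (e, b) that was
// produced from it, with e = c*Y*X + y*X + x indexing the filter tap and
// b = n*HO*WO + ho*WO + wo indexing the output pixel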
auto f = [&](auto n, auto c, auto hi, auto wi) {
double v = 0;
for(int y = 0; y < Y; ++y)
{
int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];
if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0)
{
int ho = h_tmp / ConvStrides{}[0];
for(int x = 0; x < X; ++x)
{
int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0)
{
int wo = w_tmp / ConvStrides{}[1];
int e = c * (Y * X) + y * X + x;
int b = n * (HO * WO) + ho * WO + wo;
v += in_eb(e, b);
}
}
}
}
in_nchw(n, c, hi, wi) = v;
};
auto f_par = make_ParallelTensorFunctor(f,
in_nchw.mDesc.GetLengths()[0],
in_nchw.mDesc.GetLengths()[1],
in_nchw.mDesc.GetLengths()[2],
in_nchw.mDesc.GetLengths()[3]);
f_par(std::thread::hardware_concurrency());
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "config.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "print_array.hpp"
#include "print_sequence.hpp"
#include "device.hpp"
#include "tensor_generator.hpp"
#include "device_tensor.hpp"
#include "conv_common.hpp"
#include "host_col2im.hpp"
#include "device_col2im_eb_nchw.hpp"
int main(int argc, char* argv[])
{
using namespace ck;
#if 1
constexpr index_t N = 2;
constexpr index_t C = 8;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 128;
constexpr index_t Y = 4;
constexpr index_t X = 4;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<2, 2>;
#elif 0
// 3x3, 34x34
constexpr index_t N = 64;
constexpr index_t C = 256;
constexpr index_t HI = 34;
constexpr index_t WI = 34;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
constexpr index_t N = 64;
constexpr index_t C = 1536;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 256;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
constexpr index_t N = 128;
constexpr index_t C = 2048;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 384;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 7x7 image
// cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64%
constexpr index_t N = 128;
constexpr index_t C = 832;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 384;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65%
constexpr index_t N = 128;
constexpr index_t C = 1280;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 384;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 14x14 image
// cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50%
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61%
constexpr index_t N = 64;
constexpr index_t C = 1536;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 384;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 28x28 image
// cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69%
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 7x7 image
// cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62%
constexpr index_t N = 128;
constexpr index_t C = 832;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 256;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 17x17 input
// cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76%
constexpr index_t N = 128;
constexpr index_t C = 768;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 14x14 image
// cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64%
constexpr index_t N = 128;
constexpr index_t C = 528;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 14x14 image
// cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75%
constexpr index_t N = 128;
constexpr index_t C = 528;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 256;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 7x7 image
// cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52%
constexpr index_t N = 128;
constexpr index_t C = 832;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
constexpr index_t N = 128;
constexpr index_t C = 288;
constexpr index_t HI = 35;
constexpr index_t WI = 35;
constexpr index_t K = 384;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 5x5 filter, 2x2 pad, 7x7 input
constexpr index_t N = 128;
constexpr index_t C = 48;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 128;
constexpr index_t Y = 5;
constexpr index_t X = 5;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<2, 2>;
using RightPads = Sequence<2, 2>;
#elif 0
// 7x1 filter, 3x0 pad, 17x17 input
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 128;
constexpr index_t Y = 7;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<3, 0>;
using RightPads = Sequence<3, 0>;
#elif 1
// 1x7 filter, 0x3 pad, 17x17 input
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 7;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 3>;
using RightPads = Sequence<0, 3>;
#endif
constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
constexpr index_t HO = out_nkhw_desc.GetLengths()[2];
constexpr index_t WO = out_nkhw_desc.GetLengths()[3];
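// "col" (im2col) matrix layout: C*Y*X rows (one per (c, y, x) filter tap)
// by N*HO*WO columns (one per output pixel)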
constexpr auto col_eb_desc =
make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});
using FilterSizes = Sequence<Y, X>;
using OutputSizes = Sequence<HO, WO>;
ostream_ConstantTensorDescriptor(col_eb_desc, std::cout << "col_eb_desc: ");
ostream_ConstantTensorDescriptor(img_nchw_desc, std::cout << "img_nchw_desc: ");
print_sequence("FilterSizes", FilterSizes{});
print_sequence("OutputSizes", OutputSizes{});
print_sequence("LeftPads", LeftPads{});
print_sequence("LeftPads", LeftPads{});
print_sequence("RightPads", RightPads{});
print_sequence("ConvStrides", ConvStrides{});
print_sequence("ConvDilations", ConvDilations{});
Tensor<float> col_eb(make_TensorDescriptor(col_eb_desc));
Tensor<float> img_nchw_host(make_TensorDescriptor(img_nchw_desc));
Tensor<float> img_nchw_device(make_TensorDescriptor(img_nchw_desc));
std::size_t num_thread = std::thread::hardware_concurrency();
if(argc != 3)
{
printf("arg1: do_verification, arg2: nrepeat\n");
exit(1);
}
bool do_verification = atoi(argv[1]);
std::size_t nrepeat = atoi(argv[2]);
if(do_verification)
{
#if 0
col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#else
col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#endif
}
device_col2im_eb_nchw(col_eb_desc,
col_eb,
img_nchw_desc,
img_nchw_device,
FilterSizes{},
OutputSizes{},
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
if(do_verification)
{
host_col2im(col_eb,
img_nchw_host,
FilterSizes{},
OutputSizes{},
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{});
check_error(img_nchw_host, img_nchw_device);
#if 0
LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
#endif
}
}
col2im_driver.cpp
\ No newline at end of file
@@ -245,9 +245,9 @@ int main(int argc, char* argv[])
 #endif
     }
 
-#if 1
+#if 0
     device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
-#elif 1
+#elif 0
     device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
 #elif 0
     device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw
@@ -256,17 +256,17 @@ int main(int argc, char* argv[])
 #elif 1
     device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw
 #endif
     (in_nchw_desc,
      in_nchw_device,
      wei_kcyx_desc,
      wei_kcyx,
      out_nkhw_desc,
      out_nkhw,
      ConvStrides{},
      ConvDilations{},
      LeftPads{},
      RightPads{},
      nrepeat);
 
     if(do_verification)
     {
...
conv_bwd_data_driver.cpp
\ No newline at end of file
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "config.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "print_array.hpp"
#include "print_sequence.hpp"
#include "device.hpp"
#include "tensor_generator.hpp"
#include "device_tensor.hpp"
#include "conv_common.hpp"
#include "host_conv_bwd_data.hpp"
#include "device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
int main(int argc, char* argv[])
{
using namespace launcher;
#if 0
constexpr index_t N = 64;
constexpr index_t C = 256;
constexpr index_t HI = 56;
constexpr index_t WI = 56;
constexpr index_t K = 256;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 3x3, 34x34
constexpr index_t N = 64;
constexpr index_t C = 256;
constexpr index_t HI = 34;
constexpr index_t WI = 34;
constexpr index_t K = 256;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 3x3, 28x28
constexpr index_t N = 128;
constexpr index_t C = 1024;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 1024;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 1x1 filter, 8x8 image
constexpr index_t N = 256;
constexpr index_t C = 1024;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 1024;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 7x7 image
constexpr index_t N = 128;
constexpr index_t C = 1024;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 1024;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 14x14 image
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 28x28 image
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1 filter, 17x17 input
constexpr index_t N = 128;
constexpr index_t C = 1024;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 1024;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 5x5 filter, 2x2 pad, 7x7 input
constexpr index_t N = 128;
constexpr index_t C = 1024;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 1024;
constexpr index_t Y = 5;
constexpr index_t X = 5;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<2, 2>;
using RightPads = Sequence<2, 2>;
#elif 0
// 1x7 filter, 0x3 pad, 17x17 input
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 7;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 3>;
using RightPads = Sequence<0, 3>;
#elif 0
// 7x1 filter, 3x0 pad, 17x17 input
constexpr index_t N = 128;
constexpr index_t C = 1024;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 1024;
constexpr index_t Y = 7;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<3, 0>;
using RightPads = Sequence<3, 0>;
#elif 1
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 35;
constexpr index_t WI = 35;
constexpr index_t K = 1024;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#endif
constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
print_sequence("LeftPads", LeftPads{});
print_sequence("LeftPads", LeftPads{});
print_sequence("RightPads", RightPads{});
print_sequence("ConvStrides", ConvStrides{});
print_sequence("ConvDilations", ConvDilations{});
Tensor<float> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
Tensor<float> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
Tensor<float> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
Tensor<float> out_nkhw(make_TensorDescriptor(out_nkhw_desc));
std::size_t num_thread = std::thread::hardware_concurrency();
if(argc != 3)
{
printf("arg1: do_verification, arg2: nrepeat\n");
exit(1);
}
bool do_verification = atoi(argv[1]);
std::size_t nrepeat = atoi(argv[2]);
if(do_verification)
{
#if 0
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
out_nkhw.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
#else
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#endif
}
#if 0
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
#elif 0
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
#elif 0
device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw
#elif 0
device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw
#elif 1
device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw
#endif
(in_nchw_desc,
in_nchw_device,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
if(do_verification)
{
host_direct_convolution_backward_data(in_nchw_host,
wei_kcyx,
out_nkhw,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{});
check_error(in_nchw_host, in_nchw_device);
#if 0
LogRange(std::cout << "out_nkhw : ", out_nkhw.mData, ",") << std::endl;
LogRange(std::cout << "wei_kcyx : ", wei_kcyx.mData, ",") << std::endl;
LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl;
LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl;
#endif
}
}
@@ -15,9 +15,6 @@
 #include "device_tensor.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_fp16.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_xdlops_fp16_nchw_kcyx_nkhw.hpp"
 
 int main(int argc, char* argv[])
 {
@@ -570,31 +567,6 @@ int main(int argc, char* argv[])
     }
 
 #if 0
-    device_convolution_direct_v2_nchw_kcyx_nkhw
-        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(in_nchw_desc,
-                                                              in_nchw,
-                                                              wei_kcyx_desc,
-                                                              wei_kcyx,
-                                                              out_nkhw_desc,
-                                                              out_nkhw_device,
-                                                              LeftPads{},
-                                                              RightPads{},
-                                                              nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
-        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,
@@ -618,30 +590,6 @@ int main(int argc, char* argv[])
                                                          LeftPads{},
                                                          RightPads{},
                                                          nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_fp16(in_nchw_desc,
-                                                              in_nchw,
-                                                              wei_kcyx_desc,
-                                                              wei_kcyx,
-                                                              out_nkhw_desc,
-                                                              out_nkhw_device,
-                                                              ConvStrides{},
-                                                              ConvDilations{},
-                                                              LeftPads{},
-                                                              RightPads{},
-                                                              nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r4_xdlops_fp16_nchw_kcyx_nkhw(in_nchw_desc,
-                                                                     in_nchw,
-                                                                     wei_kcyx_desc,
-                                                                     wei_kcyx,
-                                                                     out_nkhw_desc,
-                                                                     out_nkhw_device,
-                                                                     ConvStrides{},
-                                                                     ConvDilations{},
-                                                                     LeftPads{},
-                                                                     RightPads{},
-                                                                     nrepeat);
 #endif
 
     if(do_verification)
...
(One additional source diff in this commit is too large to display and is omitted here.)