adding col2im

ad3ac5cc · Chao Liu · 895e87c1 · ad3ac5cc · ad3ac5cc · ad3ac5cc
Commit ad3ac5cc authored Nov 15, 2019 by Chao Liu
8 changed files
--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
@@ -15,10 +15,14 @@ install(TARGETS host LIBRARY DESTINATION lib)


 if(DEVICE_BACKEND STREQUAL "AMD")
-    set(DRIVER_SOURCE src/conv_driver.cpp)
+    set(CONV_SOURCE src/conv_driver.cpp)
+    set(COL2IM_SOURCE src/col2im_driver.cpp)
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
-    set(DRIVER_SOURCE src/conv_driver.cu)
+    set(CONV_SOURCE src/conv_driver.cu)
+    set(COL2IM_SOURCE src/col2im_driver.cu)
 endif()

-add_executable(conv ${DRIVER_SOURCE}) 
+add_executable(conv ${CONV_SOURCE}) 
+add_executable(col2im ${COL2IM_SOURCE}) 
 target_link_libraries(conv PRIVATE host)
+target_link_libraries(col2im PRIVATE host)
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp
@@ -46,7 +46,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 1
+#if 0
    // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
    constexpr index_t BlockSize = 256;

@@ -120,7 +120,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
-#elif 1
+#elif 0
    // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data
    constexpr index_t BlockSize = 256;

@@ -157,6 +157,42 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 2;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+#elif 1
+    constexpr index_t BlockSize = 64;
+
+    constexpr index_t BPerBlock = 16;
+    constexpr index_t KPerBlock = 32;
+    constexpr index_t EPerBlock = 4;
+
+    constexpr index_t GemmNRepeat = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 1;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_N1_B_N2      = Sequence<1, 2, 1, 4>;
+    using InBlockCopyClusterLengths_E_N1_B_N2  = Sequence<4, 1, 16, 1>;
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 2, 1, 3>; // [E, B, N1, N2]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
+
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+
+    using WeiBlockCopySubLengths_E_K            = Sequence<1, 2>;
+    using WeiBlockCopyClusterLengths_E_K        = Sequence<4, 16>;
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2;
 #endif

    constexpr index_t N1 = GemmNRepeat;

--- a/driver/include/device_tensor.hpp
+++ b/driver/include/device_tensor.hpp
@@ -7,8 +7,8 @@
 template <typename ConstTensorDesc, std::size_t... Is>
 auto make_TensorDescriptor_impl(ConstTensorDesc, std::integer_sequence<std::size_t, Is...>)
 {
-    std::initializer_list<std::size_t> lengths = {ConstTensorDesc::GetLength(Is)...};
-    std::initializer_list<std::size_t> strides = {ConstTensorDesc::GetStride(Is)...};
+    std::initializer_list<std::size_t> lengths = {ConstTensorDesc::GetLengths()[Is]...};
+    std::initializer_list<std::size_t> strides = {ConstTensorDesc::GetStrides()[Is]...};

    return TensorDescriptor(lengths, strides);
 }

--- a/driver/include/host_col2im.hpp
+++ b/driver/include/host_col2im.hpp
+#pragma once
+#include "tensor.hpp"
+
+template <typename T,
+          typename FilterSizes,
+          typename OutputSizes,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename LeftPads,
+          typename RightPads>
+void host_col2im(const Tensor<T>& in_eb,
+                 Tensor<T>& in_nchw,
+                 FilterSizes,
+                 OutputSizes,
+                 ConvStrides,
+                 ConvDilations,
+                 LeftPads,
+                 RightPads)
+{
+    using namespace ck;
+
+    int N  = in_nchw.mDesc.GetLengths()[0];
+    int C  = in_nchw.mDesc.GetLengths()[1];
+    int HI = in_nchw.mDesc.GetLengths()[2];
+    int WI = in_nchw.mDesc.GetLengths()[3];
+
+    int Y = FilterSizes{}[0];
+    int X = FilterSizes{}[1];
+
+    int HO = OutputSizes{}[0];
+    int WO = OutputSizes{}[1];
+
+    auto f = [&](auto n, auto c, auto hi, auto wi) {
+        double v = 0;
+
+        for(int y = 0; y < Y; ++y)
+        {
+            int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];
+
+            if(h_tmp % ConvStrides{}[0] == 0)
+            {
+                int ho = h_tmp / ConvStrides{}[0];
+
+                for(int x = 0; x < X; ++x)
+                {
+                    int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
+
+                    if(w_tmp % ConvStrides{}[1] == 0)
+                    {
+                        int wo = w_tmp / ConvStrides{}[1];
+
+                        int e = c * (Y * X) + y * X + x;
+                        int b = n * (HO * WO) + ho * WO + wo;
+
+                        v += in_eb(e, b);
+                    }
+                }
+            }
+        }
+
+        in_nchw(n, c, hi, wi) = v;
+    };
+
+    auto f_par = make_ParallelTensorFunctor(f,
+                                            in_nchw.mDesc.GetLengths()[0],
+                                            in_nchw.mDesc.GetLengths()[1],
+                                            in_nchw.mDesc.GetLengths()[2],
+                                            in_nchw.mDesc.GetLengths()[3]);
+
+    f_par(std::thread::hardware_concurrency());
+}
--- a/driver/src/col2im_driver.cpp
+++ b/driver/src/col2im_driver.cpp
@@ -4,30 +4,35 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include "config.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "print_array.hpp"
 #include "print_sequence.hpp"
 #include "device.hpp"
 #include "tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "conv_common.hpp"
+#include "host_col2im.hpp"
 //#include "device_col2im.hpp"

 int main(int argc, char* argv[])
 {
    using namespace ck;

-#if 0
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 128;
+#if 1
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 1;
    constexpr index_t HI = 17;
    constexpr index_t WI = 17;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 7;
+    constexpr index_t K  = 1;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

-    using LeftPads  = Sequence<0, 3>;
-    using RightPads = Sequence<0, 3>;
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
 #elif 0
    // 3x3, 34x34
    constexpr index_t N  = 64;
@@ -298,43 +303,32 @@ int main(int argc, char* argv[])
    using RightPads = Sequence<0, 3>;
 #endif

-#if 0
-    auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
-    auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
-    auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor_deprecated(
+    constexpr auto in_nchw_desc  = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
+    constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
+    constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

-    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
-    ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
-    ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
-    print_sequence("LeftPads", LeftPads{});
-    print_sequence("RightPads", RightPads{});
-    print_sequence("ConvStrides", ConvStrides{});
-    print_sequence("ConvDilations", ConvDilations{});
+    constexpr index_t HO = out_nkhw_desc.GetLengths()[2];
+    constexpr index_t WO = out_nkhw_desc.GetLengths()[3];

-    using in_data_t  = float;
-    using out_data_t = float;
-    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
-    Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
-    Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
-    Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
-#else
-    auto in_nchw_desc  = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
-    auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
-    auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
-        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
+    auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});

-    auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * Ho * Wo>{});
+    using FilterSizes = Sequence<Y, X>;
+    using OutputSizes = Sequence<HO, WO>;

    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
-    ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
-    ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
    ostream_ConstantTensorDescriptor(in_eb_desc, std::cout << "in_eb_desc: ");
+    print_sequence("FilterSizes", FilterSizes{});
+    print_sequence("OutputSizes", OutputSizes{});
+    print_sequence("LeftPads", LeftPads{});
    print_sequence("LeftPads", LeftPads{});
    print_sequence("RightPads", RightPads{});
    print_sequence("ConvStrides", ConvStrides{});
    print_sequence("ConvDilations", ConvDilations{});
-#endif
+
+    Tensor<float> in_eb(make_TensorDescriptor(in_eb_desc));
+    Tensor<float> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
+    Tensor<float> in_nchw_device(make_TensorDescriptor(in_nchw_desc));

    std::size_t num_thread = std::thread::hardware_concurrency();

@@ -349,36 +343,44 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
+#if 1
+        in_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+#else
        in_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+#endif
    }

-    device_col2im(in_nchw_desc,
-                  in_nchw,
-                  in_eb_desc,
+#if 0
+    device_col2im(in_eb_desc,
                  in_eb,
+                  in_nchw_desc,
+                  in_nchw_device,
+                  FilterSizes{},
+                  OutputSizes{},
                  ConvStrides{},
                  ConvDilations{},
                  LeftPads{},
                  RightPads{},
                  nrepeat);
+#endif

    if(do_verification)
    {
-        host_direct_convolution(in_nchw,
-                                wei_kcyx,
-                                out_nkhw_host,
-                                ConvStrides{},
-                                ConvDilations{},
-                                LeftPads{},
-                                RightPads{});
-
-        check_error(out_nkhw_host, out_nkhw_device);
-
-#if 0
-        LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
-        LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
-        LogRange(std::cout << "out_nkhw_host  : ", out_nkhw_host.mData, ",") << std::endl;
-        LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
+        host_col2im(in_eb,
+                    in_nchw_host,
+                    FilterSizes{},
+                    OutputSizes{},
+                    ConvStrides{},
+                    ConvDilations{},
+                    LeftPads{},
+                    RightPads{});
+
+        check_error(in_nchw_host, in_nchw_device);
+
+#if 1
+        LogRange(std::cout << "in_eb : ", in_eb.mData, ",") << std::endl;
+        LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl;
+        LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl;
 #endif
    }
 }
--- a/driver/src/col2im_driver.cu
+++ b/driver/src/col2im_driver.cu
-col2im_driver.cpp
\ No newline at end of file
--- a/driver/src/col2im_driver.cu
+++ b/driver/src/col2im_driver.cu
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include "config.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "print_array.hpp"
+#include "print_sequence.hpp"
+#include "device.hpp"
+#include "tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "conv_common.hpp"
+#include "host_col2im.hpp"
+//#include "device_col2im.hpp"
+
+int main(int argc, char* argv[])
+{
+    using namespace ck;
+
+#if 1
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 1;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 1;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
+#elif 0
+    // 3x3, 34x34
+    constexpr index_t N  = 64;
+    constexpr index_t C  = 256;
+    constexpr index_t HI = 34;
+    constexpr index_t WI = 34;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 8x8 image
+    // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
+    constexpr index_t N  = 64;
+    constexpr index_t C  = 1536;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 8x8 image
+    // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 2048;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 7x7 image
+    // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 832;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 8x8 image
+    // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 1280;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 14x14 image
+    // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 512;
+    constexpr index_t HI = 14;
+    constexpr index_t WI = 14;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 8x8 image
+    // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61%
+    constexpr index_t N  = 64;
+    constexpr index_t C  = 1536;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 28x28 image
+    // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 256;
+    constexpr index_t HI = 28;
+    constexpr index_t WI = 28;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 7x7 image
+    // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 832;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 17x17 input
+    // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 768;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 14x14 image
+    // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 528;
+    constexpr index_t HI = 14;
+    constexpr index_t WI = 14;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 14x14 image
+    // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 528;
+    constexpr index_t HI = 14;
+    constexpr index_t WI = 14;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 7x7 image
+    // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 832;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
+    // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 288;
+    constexpr index_t HI = 35;
+    constexpr index_t WI = 35;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<2, 2>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 5x5 filter, 2x2 pad, 7x7 input
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 48;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 5;
+    constexpr index_t X  = 5;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<2, 2>;
+    using RightPads = Sequence<2, 2>;
+#elif 0
+    // 7x1 filter, 3x0 pad, 17x17 input
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 128;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 7;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<3, 0>;
+    using RightPads = Sequence<3, 0>;
+#elif 1
+    // 1x7 filter, 0x3 pad, 17x17 input
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 128;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 7;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 3>;
+    using RightPads = Sequence<0, 3>;
+#endif
+
+    constexpr auto in_nchw_desc  = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
+    constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
+    constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
+        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
+
+    constexpr index_t HO = out_nkhw_desc.GetLengths()[2];
+    constexpr index_t WO = out_nkhw_desc.GetLengths()[3];
+
+    auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});
+
+    using FilterSizes = Sequence<Y, X>;
+    using OutputSizes = Sequence<HO, WO>;
+
+    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
+    ostream_ConstantTensorDescriptor(in_eb_desc, std::cout << "in_eb_desc: ");
+    print_sequence("FilterSizes", FilterSizes{});
+    print_sequence("OutputSizes", OutputSizes{});
+    print_sequence("LeftPads", LeftPads{});
+    print_sequence("LeftPads", LeftPads{});
+    print_sequence("RightPads", RightPads{});
+    print_sequence("ConvStrides", ConvStrides{});
+    print_sequence("ConvDilations", ConvDilations{});
+
+    Tensor<float> in_eb(make_TensorDescriptor(in_eb_desc));
+    Tensor<float> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
+    Tensor<float> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
+
+    std::size_t num_thread = std::thread::hardware_concurrency();
+
+    if(argc != 3)
+    {
+        printf("arg1: do_verification, arg2: nrepeat\n");
+        exit(1);
+    }
+
+    bool do_verification = atoi(argv[1]);
+    index_t nrepeat      = atoi(argv[2]);
+
+    if(do_verification)
+    {
+#if 1
+        in_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+#else
+        in_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+#endif
+    }
+
+#if 0
+    device_col2im(in_eb_desc,
+                  in_eb,
+                  in_nchw_desc,
+                  in_nchw_device,
+                  FilterSizes{},
+                  OutputSizes{},
+                  ConvStrides{},
+                  ConvDilations{},
+                  LeftPads{},
+                  RightPads{},
+                  nrepeat);
+#endif
+
+    if(do_verification)
+    {
+        host_col2im(in_eb,
+                    in_nchw_host,
+                    FilterSizes{},
+                    OutputSizes{},
+                    ConvStrides{},
+                    ConvDilations{},
+                    LeftPads{},
+                    RightPads{});
+
+        check_error(in_nchw_host, in_nchw_device);
+
+#if 1
+        LogRange(std::cout << "in_eb : ", in_eb.mData, ",") << std::endl;
+        LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl;
+        LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl;
+#endif
+    }
+}
--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -29,20 +29,20 @@ int main(int argc, char* argv[])
 {
    using namespace ck;

-#if 0
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 128;
-    constexpr index_t HI = 17;
-    constexpr index_t WI = 17;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 7;
+#if 1
+    constexpr index_t N  = 8;
+    constexpr index_t C  = 32;
+    constexpr index_t HI = 28;
+    constexpr index_t WI = 28;
+    constexpr index_t K  = 32;
+    constexpr index_t Y  = 5;
+    constexpr index_t X  = 5;

    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
+    using ConvDilations = Sequence<2, 2>;

-    using LeftPads  = Sequence<0, 3>;
-    using RightPads = Sequence<0, 3>;
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
 #elif 0
    // 3x3, 34x34
    constexpr index_t N  = 64;
@@ -393,7 +393,7 @@ int main(int argc, char* argv[])
 #elif 0
    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
+#elif 1
    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc,
                                                                    in_nchw,
                                                                    wei_kcyx_desc,