backward data (#7)

* enabled atomic add in tensor copy * added gridwise GEMM * added backward data conv using GEMM + atomic * added backward data conv using GEMM, no atomic

backward data (#7)
* enabled atomic add in tensor copy * added gridwise GEMM * added backward data conv using GEMM + atomic * added backward data conv using GEMM, no atomic
8f5f6496 · Chao Liu · GitHub · 31ded4ac · 8f5f6496 · 8f5f6496
Unverified Commit 8f5f6496 authored Dec 03, 2019 by Chao Liu Committed by GitHub Dec 03, 2019
11 changed files
--- a/driver/src/col2im_driver.cpp
+++ b/driver/src/col2im_driver.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include "config.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "print_array.hpp"
+#include "print_sequence.hpp"
+#include "device.hpp"
+#include "tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "conv_common.hpp"
+#include "host_col2im.hpp"
+#include "device_col2im_eb_nchw.hpp"
+
+// Driver for the col2im device kernel.
+//
+// Usage: <exe> do_verification nrepeat
+//   do_verification  non-zero: fill the column matrix with generated values, additionally
+//                    run host_col2im and hand both images to check_error
+//   nrepeat          repeat count forwarded to device_col2im_eb_nchw, must be >= 0
+//
+// The problem size is fixed at compile time: the first branch of the #if/#elif chain
+// below whose condition is 1 is the configuration that gets built.
+//
+// Returns 1 on a usage error, 0 otherwise.
+int main(int argc, char* argv[])
+{
+    using namespace ck;
+
+#if 1
+    constexpr index_t N  = 2;
+    constexpr index_t C  = 8;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 4;
+    constexpr index_t X  = 4;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<2, 2>;
+#elif 0
+    // 3x3, 34x34
+    constexpr index_t N  = 64;
+    constexpr index_t C  = 256;
+    constexpr index_t HI = 34;
+    constexpr index_t WI = 34;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 8x8 image
+    // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
+    constexpr index_t N  = 64;
+    constexpr index_t C  = 1536;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 8x8 image
+    // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 2048;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 7x7 image
+    // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 832;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 8x8 image
+    // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 1280;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 14x14 image
+    // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 512;
+    constexpr index_t HI = 14;
+    constexpr index_t WI = 14;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 8x8 image
+    // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61%
+    constexpr index_t N  = 64;
+    constexpr index_t C  = 1536;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 28x28 image
+    // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 256;
+    constexpr index_t HI = 28;
+    constexpr index_t WI = 28;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 7x7 image
+    // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 832;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 17x17 input
+    // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 768;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 14x14 image
+    // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 528;
+    constexpr index_t HI = 14;
+    constexpr index_t WI = 14;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 14x14 image
+    // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 528;
+    constexpr index_t HI = 14;
+    constexpr index_t WI = 14;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 7x7 image
+    // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 832;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
+    // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 288;
+    constexpr index_t HI = 35;
+    constexpr index_t WI = 35;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<2, 2>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 5x5 filter, 2x2 pad, 7x7 input
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 48;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 5;
+    constexpr index_t X  = 5;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<2, 2>;
+    using RightPads = Sequence<2, 2>;
+#elif 0
+    // 7x1 filter, 3x0 pad, 17x17 input
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 128;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 7;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<3, 0>;
+    using RightPads = Sequence<3, 0>;
+#elif 1
+    // 1x7 filter, 0x3 pad, 17x17 input
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 128;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 7;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 3>;
+    using RightPads = Sequence<0, 3>;
+#endif
+
+    // the weight and output descriptors are only used here to derive HO and WO
+    constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
+    constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
+    constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
+        img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
+
+    constexpr index_t HO = out_nkhw_desc.GetLengths()[2];
+    constexpr index_t WO = out_nkhw_desc.GetLengths()[3];
+
+    // packed 2-d column matrix with lengths (C * Y * X, N * HO * WO)
+    constexpr auto col_eb_desc =
+        make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});
+
+    using FilterSizes = Sequence<Y, X>;
+    using OutputSizes = Sequence<HO, WO>;
+
+    ostream_ConstantTensorDescriptor(col_eb_desc, std::cout << "col_eb_desc: ");
+    ostream_ConstantTensorDescriptor(img_nchw_desc, std::cout << "img_nchw_desc: ");
+    print_sequence("FilterSizes", FilterSizes{});
+    print_sequence("OutputSizes", OutputSizes{});
+    print_sequence("LeftPads", LeftPads{});
+    print_sequence("RightPads", RightPads{});
+    print_sequence("ConvStrides", ConvStrides{});
+    print_sequence("ConvDilations", ConvDilations{});
+
+    // validate the command line before allocating any host tensor
+    if(argc != 3)
+    {
+        printf("arg1: do_verification, arg2: nrepeat\n");
+        return 1;
+    }
+
+    const bool do_verification = atoi(argv[1]);
+    const int nrepeat_arg      = atoi(argv[2]);
+
+    // a negative count would wrap to a huge value once converted to std::size_t
+    if(nrepeat_arg < 0)
+    {
+        printf("arg2: nrepeat must not be negative\n");
+        return 1;
+    }
+
+    std::size_t nrepeat = static_cast<std::size_t>(nrepeat_arg);
+
+    Tensor<float> col_eb(make_TensorDescriptor(col_eb_desc));
+    Tensor<float> img_nchw_host(make_TensorDescriptor(img_nchw_desc));
+    Tensor<float> img_nchw_device(make_TensorDescriptor(img_nchw_desc));
+
+    std::size_t num_thread = std::thread::hardware_concurrency();
+
+    // the column matrix is only given generated values when the result will be checked
+    if(do_verification)
+    {
+#if 0
+        col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+#else
+        col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+#endif
+    }
+
+    device_col2im_eb_nchw(col_eb_desc,
+                          col_eb,
+                          img_nchw_desc,
+                          img_nchw_device,
+                          FilterSizes{},
+                          OutputSizes{},
+                          ConvStrides{},
+                          ConvDilations{},
+                          LeftPads{},
+                          RightPads{},
+                          nrepeat);
+
+    if(do_verification)
+    {
+        // host reference computed from the same column matrix
+        host_col2im(col_eb,
+                    img_nchw_host,
+                    FilterSizes{},
+                    OutputSizes{},
+                    ConvStrides{},
+                    ConvDilations{},
+                    LeftPads{},
+                    RightPads{});
+
+        check_error(img_nchw_host, img_nchw_device);
+
+#if 0
+        LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
+        LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
+        LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
+#endif
+    }
+
+    return 0;
+}
--- a/driver/src/col2im_driver.cu
+++ b/driver/src/col2im_driver.cu
+col2im_driver.cpp
\ No newline at end of file
--- a/driver/src/conv_bwd_data_driver.cpp
+++ b/driver/src/conv_bwd_data_driver.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include "config.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "print_array.hpp"
+#include "print_sequence.hpp"
+#include "device.hpp"
+#include "tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "conv_common.hpp"
+#include "host_conv_bwd_data.hpp"
+#include "device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp"
+
+// Driver for the backward-data convolution device kernels.
+//
+// Usage: <exe> do_verification nrepeat
+//   do_verification  non-zero: fill the weight and output tensors with generated values,
+//                    additionally run host_direct_convolution_backward_data and hand both
+//                    input-gradient tensors to check_error
+//   nrepeat          repeat count forwarded to the selected device function, must be >= 0
+//
+// The problem size and the device implementation are fixed at compile time: in each
+// #if/#elif chain below the first branch whose condition is 1 (or the #else branch) is
+// the one that gets built.
+//
+// Returns 1 on a usage error, 0 otherwise.
+int main(int argc, char* argv[])
+{
+    using namespace ck;
+
+#if 0
+    constexpr index_t N  = 8;
+    constexpr index_t C  = 128;
+    constexpr index_t HI = 16;
+    constexpr index_t WI = 16;
+    constexpr index_t K  = 8;
+    constexpr index_t Y  = 2;
+    constexpr index_t X  = 2;
+
+    using ConvStrides   = Sequence<4, 4>;
+    using ConvDilations = Sequence<2, 2>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 3x3, 34x34
+    constexpr index_t N  = 64;
+    constexpr index_t C  = 256;
+    constexpr index_t HI = 34;
+    constexpr index_t WI = 34;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 8x8 image
+    constexpr index_t N  = 64;
+    constexpr index_t C  = 1536;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 8x8 image
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 2048;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 7x7 image
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 832;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 8x8 image
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 1280;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 14x14 image
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 512;
+    constexpr index_t HI = 14;
+    constexpr index_t WI = 14;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 8x8 image
+    constexpr index_t N  = 64;
+    constexpr index_t C  = 1536;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 28x28 image
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 256;
+    constexpr index_t HI = 28;
+    constexpr index_t WI = 28;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 7x7 image
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 832;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 17x17 input
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 768;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 14x14 image
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 528;
+    constexpr index_t HI = 14;
+    constexpr index_t WI = 14;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 14x14 image
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 528;
+    constexpr index_t HI = 14;
+    constexpr index_t WI = 14;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1 filter, 7x7 image
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 832;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 288;
+    constexpr index_t HI = 35;
+    constexpr index_t WI = 35;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<2, 2>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 5x5 filter, 2x2 pad, 7x7 input
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 48;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 5;
+    constexpr index_t X  = 5;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<2, 2>;
+    using RightPads = Sequence<2, 2>;
+#elif 0
+    // 7x1 filter, 3x0 pad, 17x17 input
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 128;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 7;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<3, 0>;
+    using RightPads = Sequence<3, 0>;
+#elif 1
+    // 1x7 filter, 0x3 pad, 17x17 input
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 128;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 7;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 3>;
+    using RightPads = Sequence<0, 3>;
+#endif
+
+    constexpr auto in_nchw_desc  = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
+    constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
+    constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
+        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
+
+    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
+    ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
+    ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
+    print_sequence("LeftPads", LeftPads{});
+    print_sequence("RightPads", RightPads{});
+    print_sequence("ConvStrides", ConvStrides{});
+    print_sequence("ConvDilations", ConvDilations{});
+
+    // validate the command line before allocating any host tensor
+    if(argc != 3)
+    {
+        printf("arg1: do_verification, arg2: nrepeat\n");
+        return 1;
+    }
+
+    const bool do_verification = atoi(argv[1]);
+    const int nrepeat_arg      = atoi(argv[2]);
+
+    // a negative count would wrap to a huge value once converted to std::size_t
+    if(nrepeat_arg < 0)
+    {
+        printf("arg2: nrepeat must not be negative\n");
+        return 1;
+    }
+
+    std::size_t nrepeat = static_cast<std::size_t>(nrepeat_arg);
+
+    // in_nchw_device is written by the device function, in_nchw_host by the host reference
+    Tensor<float> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
+    Tensor<float> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
+    Tensor<float> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
+    Tensor<float> out_nkhw(make_TensorDescriptor(out_nkhw_desc));
+
+    std::size_t num_thread = std::thread::hardware_concurrency();
+
+    // the operands are only given generated values when the result will be checked
+    if(do_verification)
+    {
+#if 0
+        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
+        out_nkhw.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
+#else
+        wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+#endif
+    }
+
+    // the preprocessor picks the function name; the argument list below is shared
+#if 0
+    device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
+#elif 0
+    device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
+#else
+    device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw
+#endif
+    (in_nchw_desc,
+     in_nchw_device,
+     wei_kcyx_desc,
+     wei_kcyx,
+     out_nkhw_desc,
+     out_nkhw,
+     ConvStrides{},
+     ConvDilations{},
+     LeftPads{},
+     RightPads{},
+     nrepeat);
+
+    if(do_verification)
+    {
+        // host reference computed from the same weight and output tensors
+        host_direct_convolution_backward_data(in_nchw_host,
+                                              wei_kcyx,
+                                              out_nkhw,
+                                              ConvStrides{},
+                                              ConvDilations{},
+                                              LeftPads{},
+                                              RightPads{});
+
+        check_error(in_nchw_host, in_nchw_device);
+
+#if 0
+        LogRange(std::cout << "out_nkhw : ", out_nkhw.mData, ",") << std::endl;
+        LogRange(std::cout << "wei_kcyx : ", wei_kcyx.mData, ",") << std::endl;
+        LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl;
+        LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl;
+#endif
+    }
+
+    return 0;
+}
--- a/driver/src/conv_bwd_data_driver.cu
+++ b/driver/src/conv_bwd_data_driver.cu
+conv_bwd_data_driver.cpp
\ No newline at end of file
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -8,8 +8,10 @@
 #include "print_array.hpp"
 #include "print_sequence.hpp"
 #include "device.hpp"
+#include "tensor_generator.hpp"
 #include "conv_common.hpp"
 #include "host_conv.hpp"
+#include "device_tensor.hpp"
 //#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
 //#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp"
@@ -23,73 +25,24 @@
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"

-struct GeneratorTensor_1
-{
-    template <class... Is>
-    double operator()(Is... is)
-    {
-        return 1;
-    }
-};
-
-struct GeneratorTensor_2
-{
-    int min_value = 0;
-    int max_value = 1;
-
-    template <class... Is>
-    double operator()(Is...)
-    {
-        return (std::rand() % (max_value - min_value)) + min_value;
-    }
-};
-
-struct GeneratorTensor_3
-{
-    template <class... Is>
-    double operator()(Is... is)
-    {
-        std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
-
-        auto f_acc = [](auto a, auto b) { return 10 * a + b; };
-
-        return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
-    }
-};
-
-struct GeneratorTensor_Checkboard
-{
-    template <class... Ts>
-    double operator()(Ts... Xs) const
-    {
-        std::array<index_t, sizeof...(Ts)> dims = {{Xs...}};
-        return std::accumulate(dims.begin(),
-                               dims.end(),
-                               true,
-                               [](bool init, index_t x) -> int { return init != (x % 2); })
-                   ? 1
-                   : -1;
-    }
-};
-
 int main(int argc, char* argv[])
 {
    using namespace ck;

 #if 0
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 128;
-    constexpr index_t HI = 17;
-    constexpr index_t WI = 17;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 7;
+    constexpr index_t N  = 8;
+    constexpr index_t C  = 32;
+    constexpr index_t HI = 28;
+    constexpr index_t WI = 28;
+    constexpr index_t K  = 32;
+    constexpr index_t Y  = 5;
+    constexpr index_t X  = 5;

    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
+    using ConvDilations = Sequence<2, 2>;

-    using LeftPads  = Sequence<0, 3>;
-    using RightPads = Sequence<0, 3>;
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
 #elif 0
    // 3x3, 34x34
    constexpr index_t N  = 64;
@@ -297,7 +250,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
    // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
    constexpr index_t N  = 128;
@@ -343,7 +296,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<3, 0>;
    using RightPads = Sequence<3, 0>;
-#elif 1
+#elif 0
    // 1x7 filter, 0x3 pad, 17x17 input
    constexpr index_t N  = 128;
    constexpr index_t C  = 128;
@@ -362,7 +315,7 @@ int main(int argc, char* argv[])

    auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
    auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
-    auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
+    auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor_deprecated(
        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
@@ -492,7 +445,7 @@ int main(int argc, char* argv[])
                                                                    ConvStrides{},
                                                                    ConvDilations{},
                                                                    nrepeat);
-#elif 0
+#elif 1
    device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,

--- a/driver/src/conv_driver.cu
+++ b/driver/src/conv_driver.cu
+conv_driver.cpp
\ No newline at end of file
--- a/driver/src/driver.cu
+++ b/driver/src/driver.cu
-driver.cpp
\ No newline at end of file
--- a/driver/src/tensor.cpp
+++ b/driver/src/tensor.cpp
@@ -3,12 +3,21 @@

 #include "tensor.hpp"

-TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mLens(lens)
+// Builds a descriptor from lengths only, then calls CalculateStrides().
+// The lengths are copied element by element, so X only has to convert to the element
+// type of mLens.
+// NOTE(review): a member template defined in a .cpp file is usable from other translation
+// units only if it is also defined in the header or explicitly instantiated - confirm
+// against tensor.hpp.
+template <typename X>
+TensorDescriptor::TensorDescriptor(std::vector<X> lens) : mLens(lens.begin(), lens.end())
 {
    this->CalculateStrides();
 }

-TensorDescriptor::TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides)
+// Builds a descriptor from explicit lengths and strides, both copied element by element.
+template <typename X, typename Y>
+TensorDescriptor::TensorDescriptor(std::vector<X> lens, std::vector<Y> strides)
-    : mLens(lens), mStrides(strides)
+    : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
 {
 }

--- a/script/compile-hip.sh
+++ b/script/compile-hip.sh
@@ -4,5 +4,5 @@
 export KMDUMPLLVM=1
 export KMDUMPDIR=$PWD

- make -j driver
+ make -j $1
 #/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm
--- a/script/docker-cuda.sh
+++ b/script/docker-cuda.sh
+# Start an interactive CUDA container with a host directory mounted at /root/workspace.
+#
+# Usage: docker-cuda.sh <workspace-dir>
+WORKSPACE=$1
+if [ -z "$WORKSPACE" ]; then
+    # without a directory the -v option below would be the malformed ":/root/workspace"
+    echo "usage: $0 <workspace-dir>" >&2
+    # "return" covers the case where the script is sourced, "exit" where it is executed
+    return 1 2>/dev/null || exit 1
+fi
+echo "workspace: " "$WORKSPACE"
+# quoting keeps a path containing spaces as a single -v argument
+sudo docker run  -it  -v "$WORKSPACE":/root/workspace --group-add sudo --runtime=nvidia     asroy/cuda:10.1-cudnn7-devel-ubuntu18.04-latest /bin/bash
--- a/script/ds_read_offset.sh
+++ b/script/ds_read_offset.sh
-for((i=0;i<=4096;i=i+64))
-do
-    OFFSET=$i
-    echo "if(offset == $OFFSET)"
-    echo "{"
-    echo "    asm volatile(\"\\n \\"
-    echo "        ds_read_b128 %0, %1 offset:$OFFSET\n \\"
-    echo "        \""
-    echo "    : \"=v\"(r)"
-    echo "    : \"v\"(__to_local(lds)));"
-    echo "}"
-done