initial implementation for nchw v4r4 padding

2c93b305 · Chao Liu · 53094f7f · 2c93b305 · 2c93b305 · 2c93b305
Commit 2c93b305 authored Sep 15, 2019 by Chao Liu
12 changed files
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -100,10 +100,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
        constexpr index_t E = C * Y * X;

        // sanity-check for vectorized memory load
-        static_assert(ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1,
-                      "wrong! global vector load of input tensor is wrong");
-
-        static_assert((X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0),
+        static_assert((Ho == 1 || ConvStrideW % InBlockCopySrcDataPerRead_B == 0) &&
+                          (X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0),
                      "wrong! aligment requirement for vectorized global load of input tensor will "
                      "be violated");


--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -100,10 +100,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
        constexpr index_t E = C * Y * X;

        // sanity-check for vectorized memory load
-        static_assert(ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1,
-                      "wrong! global vector load of input tensor is wrong");
-
-        static_assert((X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0),
+        static_assert((Ho == 1 || ConvStrideW % InBlockCopySrcDataPerRead_B == 0) &&
+                          (X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0),
                      "wrong! aligment requirement for vectorized global load of input tensor will "
                      "be violated");


--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
@@ -107,10 +107,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
        constexpr index_t E = C * Y * X;

        // sanity-check for vectorized memory load
-        static_assert(ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1,
-                      "wrong! global vector load of input tensor is wrong");
-
-        static_assert((X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0),
+        static_assert((Ho == 1 || ConvStrideW % InBlockCopySrcDataPerRead_B == 0) &&
+                          (X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0),
                      "wrong! aligment requirement for vectorized global load of input tensor will "
                      "be violated");


--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -83,7 +83,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
        constexpr index_t E = C * Y * X;
        constexpr index_t B = N * Ho * Wo;

-        static_assert((X == 1 || ConvDilationW % InBlockCopyDataPerAccess_B == 0),
+        // sanity-check for vectorized memory load
+        static_assert((Ho == 1 || ConvStrideW % InBlockCopyDataPerAccess_B == 0) &&
+                          (X == 1 || ConvDilationW % InBlockCopyDataPerAccess_B == 0),
                      "wrong! aligment requirement for vectorized global load of input tensor will "
                      "be violated");


--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -83,7 +83,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
        constexpr index_t E = C * Y * X;
        constexpr index_t B = N * Ho * Wo;

-        static_assert((X == 1 || ConvDilationW % InBlockCopyDataPerAccess_B == 0),
+        // sanity-check for vectorized memory load
+        static_assert((Ho == 1 || ConvStrideW % InBlockCopyDataPerAccess_B == 0) &&
+                          (X == 1 || ConvDilationW % InBlockCopyDataPerAccess_B == 0),
                      "wrong! aligment requirement for vectorized global load of input tensor will "
                      "be violated");


--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
--- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
@@ -190,14 +190,16 @@ struct TensorCoordinate_v2
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
-        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>();
+        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
+            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
    }

    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {
-        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>();
+        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
+            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
    }

    public:

--- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -187,8 +187,28 @@ struct TransformedTensorDescriptor
                          nTransform == UpDimensionIds::Size(),
                      "wrong! # of transformations not the same");

-        // TODO: sanity check: LowDimensionIds should include all low-dimensions,
+        // sanity check:
+        //   LowDimensionIds should include all low-dimensions,
        //   UpDimensionIds should include all up-dimensions
+        using mingled_up_dimension_ids =
+            decltype(unpack(lambda_merge_sequences{}, UpDimensionIds{}));
+
+        using sorted_up_dimension_ids =
+            typename sequence_sort<mingled_up_dimension_ids, math::less<index_t>>::type;
+
+        static_assert(sorted_up_dimension_ids::Size() == nDimUp &&
+                          is_valid_sequence_map<sorted_up_dimension_ids>{},
+                      "wrong! UpDimensionIds is not configured correctly");
+
+        using mingled_low_dimension_ids =
+            decltype(unpack(lambda_merge_sequences{}, LowDimensionIds{}));
+
+        using sorted_low_dimension_ids =
+            typename sequence_sort<mingled_low_dimension_ids, math::less<index_t>>::type;
+
+        static_assert(sorted_low_dimension_ids::Size() == nDimLow &&
+                          is_valid_sequence_map<sorted_low_dimension_ids>{},
+                      "wrong! LowDimensionIds is not configured correctly");

        // TODO: sanity check: while an up-dimension could be associated with multiple
        //   transformations, a low-dimension should be associated with only one transformation

--- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
+#pragma once
+#include <unistd.h>
+#include "device.hpp"
+#include "tensor.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"
+
+template <class T,
+          class InDesc,
+          class WeiDesc,
+          class OutDesc,
+          class ConvStrides,
+          class ConvDilations,
+          class LeftPads,
+          class RightPads>
+void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
+                                                                 const Tensor<T>& in_nchw,
+                                                                 WeiDesc,
+                                                                 const Tensor<T>& wei_kcyx,
+                                                                 OutDesc,
+                                                                 Tensor<T>& out_nkhw,
+                                                                 ConvStrides,
+                                                                 ConvDilations,
+                                                                 LeftPads,
+                                                                 RightPads,
+                                                                 index_t nrepeat)
+{
+    using namespace ck;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto in_nchw_desc  = InDesc{};
+    constexpr auto wei_kcyx_desc = WeiDesc{};
+    constexpr auto out_nkhw_desc = OutDesc{};
+
+    constexpr index_t N  = out_nkhw_desc.GetLength(I0);
+    constexpr index_t K  = out_nkhw_desc.GetLength(I1);
+    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
+    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
+
+    std::size_t data_sz = sizeof(T);
+    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
+    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
+    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
+
+    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
+    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
+    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
+
+#if 1
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_B            = Sequence<4, 1>;
+    using InBlockCopyClusterLengths_E_B        = Sequence<2, 128>;
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]
+
+    constexpr index_t InBlockCopyDataPerAccess_B = 1;
+
+    using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_B = 1;
+#elif 1
+    // 1x1 filter, 8x8 image
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_B            = Sequence<1, 4>;
+    using InBlockCopyClusterLengths_E_B        = Sequence<8, 32>;
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]
+
+    constexpr index_t InBlockCopyDataPerAccess_B = 4;
+
+    using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_B = 4;
+#elif 0
+    // 1x1 filter, 14x14 image
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_B            = Sequence<2, 2>;
+    using InBlockCopyClusterLengths_E_B        = Sequence<4, 64>;
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]
+
+    constexpr index_t InBlockCopyDataPerAccess_B = 2;
+
+    using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_B = 2;
+#endif
+
+    constexpr index_t B = N * Ho * Wo;
+
+    constexpr index_t GridSize =
+        ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
+
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+
+    constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded<
+        GridSize,
+        BlockSize,
+        T,
+        decltype(in_nchw_desc),
+        decltype(wei_kcyx_desc),
+        decltype(out_nkhw_desc),
+        ConvStrides,
+        ConvDilations,
+        LeftPads,
+        RightPads,
+        BPerBlock,
+        KPerBlock,
+        EPerBlock,
+        GemmMPerThreadSubC,
+        GemmNPerThreadSubC,
+        GemmMLevel0Cluster,
+        GemmNLevel0Cluster,
+        GemmMLevel1Cluster,
+        GemmNLevel1Cluster,
+        GemmKPerThreadLoop,
+        GemmDataPerReadA,
+        GemmDataPerReadB,
+        InBlockCopySubLengths_E_B,
+        InBlockCopyClusterLengths_E_B,
+        InBlockCopyThreadClusterArrangeOrder,
+        InBlockCopySrcAccessOrder,
+        InBlockCopyDstAccessOrder,
+        InBlockCopyDataPerAccess_B,
+        WeiBlockCopySubLengths_E_K,
+        WeiBlockCopyClusterLengths_E_K,
+        WeiBlockCopyThreadClusterArrangeOrder,
+        WeiBlockCopySrcAccessOrder,
+        WeiBlockCopyDstAccessOrder,
+        WeiBlockCopySrcDataPerRead_E,
+        WeiBlockCopyDstDataPerWrite_K,
+        OutThreadCopyDataPerAccess_B>{};
+
+    for(index_t i = 0; i < nrepeat; ++i)
+    {
+        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
+                                   dim3(GridSize),
+                                   dim3(BlockSize),
+                                   0,
+                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
+
+        printf("Elapsed time : %f ms, %f TFlop/s\n",
+               time,
+               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
+                   (std::size_t(1000) * 1000 * 1000) / time);
+        usleep(std::min(time * 1000, float(10000)));
+    }
+
+    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
+}
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -19,6 +19,7 @@
 //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"

 struct GeneratorTensor_1
 {

--- a/driver/src/driver.cu
+++ b/driver/src/driver.cu
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include "config.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "device.hpp"
-#include "conv_common.hpp"
-#include "host_conv.hpp"
-#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
-#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp"
-//#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
-//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
-//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-
-struct GeneratorTensor_1
-{
-    template <class... Is>
-    double operator()(Is... is)
-    {
-        return 1;
-    }
-};
-
-struct GeneratorTensor_2
-{
-    int min_value = 0;
-    int max_value = 1;
-
-    template <class... Is>
-    double operator()(Is...)
-    {
-        return (std::rand() % (max_value - min_value)) + min_value;
-    }
-};
-
-struct GeneratorTensor_3
-{
-    template <class... Is>
-    double operator()(Is... is)
-    {
-        std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
-
-        auto f_acc = [](auto a, auto b) { return 100 * a + b; };
-
-        return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
-    }
-};
-
-struct GeneratorTensor_Checkboard
-{
-    template <class... Ts>
-    double operator()(Ts... Xs) const
-    {
-        std::array<index_t, sizeof...(Ts)> dims = {{Xs...}};
-        return std::accumulate(dims.begin(),
-                               dims.end(),
-                               true,
-                               [](bool init, index_t x) -> int { return init != (x % 2); })
-                   ? 1
-                   : -1;
-    }
-};
-
-int main(int argc, char* argv[])
-{
-    using namespace ck;
-
-#if 0
-    constexpr index_t N  = 32;
-    constexpr index_t C  = 8;
-    constexpr index_t HI = 1;
-    constexpr index_t WI = 1;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    using LeftPads  = Sequence<1, 1>;
-    using RightPads = Sequence<0, 0>;
-#elif 1
-    // 3x3, 34x34
-    constexpr index_t N  = 64;
-    constexpr index_t C  = 256;
-    constexpr index_t HI = 34;
-    constexpr index_t WI = 34;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    using LeftPads  = Sequence<0, 0>;
-    using RightPads = Sequence<0, 0>;
-#elif 0
-    // 1x1 filter, 8x8 image
-    // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
-    constexpr index_t N  = 64;
-    constexpr index_t C  = 1536;
-    constexpr index_t HI = 8;
-    constexpr index_t WI = 8;
-    constexpr index_t K  = 256;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 0
-    // 1x1 filter, 8x8 image
-    // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 2048;
-    constexpr index_t HI = 8;
-    constexpr index_t WI = 8;
-    constexpr index_t K  = 384;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 0
-    // 1x1 filter, 7x7 image
-    // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64%
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 832;
-    constexpr index_t HI = 7;
-    constexpr index_t WI = 7;
-    constexpr index_t K  = 384;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 0
-    // 1x1 filter, 8x8 image
-    // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65%
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 1280;
-    constexpr index_t HI = 8;
-    constexpr index_t WI = 8;
-    constexpr index_t K  = 384;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 0
-    // 1x1 filter, 14x14 image
-    // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50%
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 512;
-    constexpr index_t HI = 14;
-    constexpr index_t WI = 14;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 0
-    // 1x1 filter, 8x8 image
-    // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61%
-    constexpr index_t N  = 64;
-    constexpr index_t C  = 1536;
-    constexpr index_t HI = 8;
-    constexpr index_t WI = 8;
-    constexpr index_t K  = 384;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 0
-    // 1x1 filter, 28x28 image
-    // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69%
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 256;
-    constexpr index_t HI = 28;
-    constexpr index_t WI = 28;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 0
-    // 1x1 filter, 7x7 image
-    // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62%
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 832;
-    constexpr index_t HI = 7;
-    constexpr index_t WI = 7;
-    constexpr index_t K  = 256;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 0
-    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
-    // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 288;
-    constexpr index_t HI = 35;
-    constexpr index_t WI = 35;
-    constexpr index_t K  = 384;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
-
-    using ConvStrides   = Sequence<2, 2>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 1
-    // 1x1 filter, 17x17 input
-    // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76%
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 768;
-    constexpr index_t HI = 17;
-    constexpr index_t WI = 17;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 0
-    // 1x1 filter, 14x14 image
-    // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64%
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 528;
-    constexpr index_t HI = 14;
-    constexpr index_t WI = 14;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 0
-    // 1x1 filter, 14x14 image
-    // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75%
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 528;
-    constexpr index_t HI = 14;
-    constexpr index_t WI = 14;
-    constexpr index_t K  = 256;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 0
-    // 1x1 filter, 7x7 image
-    // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52%
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 832;
-    constexpr index_t HI = 7;
-    constexpr index_t WI = 7;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#endif
-
-    auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
-    auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
-    auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
-        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
-
-    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
-    ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
-    ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
-
-    using in_data_t  = float;
-    using out_data_t = float;
-    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
-    Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
-    Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
-    Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
-
-    std::size_t num_thread = std::thread::hardware_concurrency();
-
-    if(argc != 3)
-    {
-        printf("arg1: do_verification, arg2: nrepeat\n");
-        exit(1);
-    }
-
-    bool do_verification = atoi(argv[1]);
-    index_t nrepeat      = atoi(argv[2]);
-
-    if(do_verification)
-    {
-#if 0
-        in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-#elif 0
-        in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-#elif 0
-        in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
-        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-#elif 1
-        in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-#elif 0
-        in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
-
-        auto gen_wei = [](auto... is) {
-            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
-        };
-        wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
-#endif
-    }
-
-#if 0
-    device_convolution_direct_v2_nchw_kcyx_nkhw
-        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(in_nchw_desc,
-                                                              in_nchw,
-                                                              wei_kcyx_desc,
-                                                              wei_kcyx,
-                                                              out_nkhw_desc,
-                                                              out_nkhw_device,
-                                                              LeftPads{},
-                                                              RightPads{},
-                                                              nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
-        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
-                                                         in_nchw,
-                                                         wei_kcyx_desc,
-                                                         wei_kcyx,
-                                                         out_nkhw_desc,
-                                                         out_nkhw_device,
-                                                         ConvStrides{},
-                                                         ConvDilations{},
-                                                         nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
-                                                                in_nchw,
-                                                                wei_kcyx_desc,
-                                                                wei_kcyx,
-                                                                out_nkhw_desc,
-                                                                out_nkhw_device,
-                                                                ConvStrides{},
-                                                                ConvDilations{},
-                                                                LeftPads{},
-                                                                RightPads{},
-                                                                nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc,
-                                                         in_nchw,
-                                                         wei_kcyx_desc,
-                                                         wei_kcyx,
-                                                         out_nkhw_desc,
-                                                         out_nkhw_device,
-                                                         ConvStrides{},
-                                                         ConvDilations{},
-                                                         nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(in_nchw_desc,
-                                                         in_nchw,
-                                                         wei_kcyx_desc,
-                                                         wei_kcyx,
-                                                         out_nkhw_desc,
-                                                         out_nkhw_device,
-                                                         ConvStrides{},
-                                                         ConvDilations{},
-                                                         nrepeat);
-#elif 1
-    device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
-                                                         in_nchw,
-                                                         wei_kcyx_desc,
-                                                         wei_kcyx,
-                                                         out_nkhw_desc,
-                                                         out_nkhw_device,
-                                                         ConvStrides{},
-                                                         ConvDilations{},
-                                                         nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc,
-                                                                in_nchw,
-                                                                wei_kcyx_desc,
-                                                                wei_kcyx,
-                                                                out_nkhw_desc,
-                                                                out_nkhw_device,
-                                                                ConvStrides{},
-                                                                ConvDilations{},
-                                                                LeftPads{},
-                                                                RightPads{},
-                                                                nrepeat);
-#endif
-
-    if(do_verification)
-    {
-#if 1
-        if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
-           ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
-        {
-            host_winograd_3x3_convolution(
-                in_nchw, wei_kcyx, out_nkhw_host, LeftPads{}, RightPads{});
-        }
-        else
-#endif
-        {
-            host_direct_convolution(in_nchw,
-                                    wei_kcyx,
-                                    out_nkhw_host,
-                                    ConvStrides{},
-                                    ConvDilations{},
-                                    LeftPads{},
-                                    RightPads{});
-        }
-        check_error(out_nkhw_host, out_nkhw_device);
-
-#if 0
-        LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
-        LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
-        LogRange(std::cout << "out_nkhw_host  : ", out_nkhw_host.mData, ",") << std::endl;
-        LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
-#endif
-    }
-}
--- a/driver/src/driver.cu
+++ b/driver/src/driver.cu
+driver.cpp
\ No newline at end of file