added col2im

6ed96265 · Chao Liu · ad3ac5cc · 6ed96265 · 6ed96265 · 6ed96265
Commit 6ed96265 authored Nov 17, 2019 by Chao Liu
11 changed files
--- a/composable_kernel/include/gridwise_operation_wrapper.hpp
+++ b/composable_kernel/include/gridwise_operation_wrapper.hpp
+#ifndef CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
+#define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
+
+template <typename GridwiseOp, typename... Xs>
+__global__ void run_gridwise_operation(GridwiseOp& gridwise_op, Xs... xs)
+{
+    gridwise_op.Run(xs...);
+}
+
+#endif
--- a/composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp
+#ifndef CK_GRIDWISE_COL2IM_EB_NCHW_HPP
+#define CK_GRIDWISE_COL2IM_EB_NCHW_HPP
+
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "blockwise_generic_tensor_slice_copy.hpp"
+
+namespace ck {
+
+// B = merge(N, Ho, Wo)
+template <index_t GridSize,
+          index_t BlockSize,
+          typename Float,
+          typename ColGlobalDesc,
+          typename ImgGlobalDesc,
+          typename FilterSizes,
+          typename OutputSizes,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename LeftPads,
+          typename RightPads,
+          index_t EPerBlock,
+          index_t BPerBlock,
+          typename BlockCopySubLengths_E_B,
+          typename BlockCopyClusterLengths_E_B,
+          typename BlockCopyThreadClusterArrangeOrder,
+          typename BlockCopySrcAccessOrder,
+          typename BlockCopyDstAccessOrder,
+          index_t BlockCopyDataPerAccess_B>
+struct GridwiseCol2Im_eb_nchw
+{
+    __device__ void Run(const Float* const __restrict__ p_col_global,
+                        Float* const __restrict__ p_img_global) const
+    {
+        constexpr auto col_e_b_global_desc       = ColGlobalDesc{};
+        constexpr auto img_n_c_hi_wi_global_desc = ImgGlobalDesc{};
+
+        constexpr index_t N  = img_n_c_hi_wi_global_desc.GetLengths()[0];
+        constexpr index_t C  = img_n_c_hi_wi_global_desc.GetLengths()[1];
+        constexpr index_t Hi = img_n_c_hi_wi_global_desc.GetLengths()[2];
+        constexpr index_t Wi = img_n_c_hi_wi_global_desc.GetLengths()[3];
+
+        constexpr index_t Ho = OutputSizes{}[0];
+        constexpr index_t Wo = OutputSizes{}[1];
+
+        constexpr index_t Y = FilterSizes{}[0];
+        constexpr index_t X = FilterSizes{}[1];
+
+        constexpr index_t ConvStrideH = ConvStrides{}[0];
+        constexpr index_t ConvStrideW = ConvStrides{}[1];
+
+        constexpr index_t ConvDilationH = ConvDilations{}[0];
+        constexpr index_t ConvDilationW = ConvDilations{}[1];
+
+        constexpr index_t E = C * Y * X;
+        constexpr index_t B = N * Ho * Wo;
+
+        // sanity-check for vectorized memory load
+        static_assert((Wo == 1 || (ConvStrideW == 1 || BlockCopyDataPerAccess_B == 1)) &&
+                          (X == 1 || ConvDilationW % BlockCopyDataPerAccess_B == 0),
+                      "wrong! aligment requirement for vectorized global load of input tensor will "
+                      "be violated");
+
+        // divide block work by [E, B]
+        static_assert(E % EPerBlock == 0 && B % BPerBlock == 0,
+                      "wrong! cannot divide work evenly among block");
+
+        constexpr index_t EBlockWork = E / EPerBlock;
+        constexpr index_t BBlockWork = B / BPerBlock;
+
+        constexpr auto block_work_desc =
+            make_cluster_descriptor(Sequence<EBlockWork, BBlockWork>{});
+
+        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
+
+        const index_t e_block_data_on_global = block_work_id[0] * EPerBlock;
+        const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;
+
+        // construct img_eb_global_desc
+        constexpr auto img_n_c_hip_wip_global_desc = transform_tensor_descriptor(
+            img_n_c_hi_wi_global_desc,
+            make_tuple(
+                PassThrough<N>{}, PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));
+
+        constexpr auto img_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
+            img_n_c_hip_wip_global_desc,
+            make_tuple(PassThrough<N>{},
+                       PassThrough<C>{},
+                       Embed<Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
+                       Embed<Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+
+        constexpr auto img_e_b_global_desc = transform_tensor_descriptor(
+            img_n_c_y_ho_x_wo_global_desc,
+            make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
+            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        // blockwise atomic accumulation
+        auto blockwise_copy = BlockwiseGenericTensorSliceCopy_v4<BlockSize,
+                                                                 decltype(col_e_b_global_desc),
+                                                                 decltype(img_e_b_global_desc),
+                                                                 Sequence<EPerBlock, BPerBlock>,
+                                                                 BlockCopySubLengths_E_B,
+                                                                 BlockCopyClusterLengths_E_B,
+                                                                 BlockCopyThreadClusterArrangeOrder,
+                                                                 BlockCopySrcAccessOrder,
+                                                                 BlockCopyDstAccessOrder,
+                                                                 1,
+                                                                 1,
+                                                                 BlockCopyDataPerAccess_B,
+                                                                 BlockCopyDataPerAccess_B>(
+            {e_block_data_on_global, b_block_data_on_global},
+            {e_block_data_on_global, b_block_data_on_global});
+
+        // blockwise copy
+        blockwise_copy.Run(p_col_global, p_img_global);
+    }
+};
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -134,8 +134,18 @@ struct BlockwiseGenericTensorSliceCopy_v4
        }
        else
        {
+#if 0 // debug
            mThreadwiseStore.Run(
                p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
+#else
+            constexpr auto True = integral_constant<bool, true>{};
+
+            mThreadwiseStore.Run(p_thread_buffer,
+                                 p_block_dst,
+                                 thread_buffer_address_space,
+                                 block_dst_address_space,
+                                 True);
+#endif
        }
    }


--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -69,11 +69,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
    template <typename SrcData,
              typename DstData,
              AddressSpace SrcAddressSpace,
-              AddressSpace DstAddressSpace>
+              AddressSpace DstAddressSpace,
+              bool DoAtomicAdd = false>
    __device__ void Run(const SrcData* p_src,
                        DstData* p_dst,
                        integral_constant<AddressSpace, SrcAddressSpace>,
-                        integral_constant<AddressSpace, DstAddressSpace>) const
+                        integral_constant<AddressSpace, DstAddressSpace>,
+                        integral_constant<bool, DoAtomicAdd> do_atomic_add =
+                            integral_constant<bool, DoAtomicAdd>{}) const
    {
        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
@@ -160,21 +163,27 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                //   has the same padding situation
                if(dst_coord.IsUpperIndexMappedToValidOffset())
                {
-                    static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
+                    static_if<!DoAtomicAdd>{}([&](auto) {
+                        static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
-                        amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
-                            fwd(p_dst),
-                            dst_coord.GetOffset(),
-                            0);
+                            amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
+                                *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
+                                fwd(p_dst),
+                                dst_coord.GetOffset(),
+                                0);
 #else
-                        *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
+                                *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
 #endif
+                        }).Else([&](auto) {
+                            // dst can be all kinds of memory-space
+                            *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
+                                *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
+                        });
                    }).Else([&](auto) {
-                        // dst can be all kinds of memory-space
-                        *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
+                        atomicAdd(
+                            reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]),
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]));
                    });
                }
            }

--- a/driver/include/device_col2im_eb_nchw.hpp
+++ b/driver/include/device_col2im_eb_nchw.hpp
+#pragma once
+#include <unistd.h>
+#include "device.hpp"
+#include "tensor.hpp"
+#include "gridwise_operation_wrapper.hpp"
+#include "gridwise_col2im_eb_nchw.hpp"
+
+template <typename T,
+          typename ColDesc,
+          typename ImgDesc,
+          typename FilterSizes,
+          typename OutputSizes,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename LeftPads,
+          typename RightPads>
+void device_col2im_eb_nchw(ColDesc,
+                           const Tensor<T>& col_eb,
+                           ImgDesc,
+                           Tensor<T>& img_nchw,
+                           FilterSizes,
+                           OutputSizes,
+                           ConvStrides,
+                           ConvDilations,
+                           LeftPads,
+                           RightPads,
+                           std::size_t nrepeat)
+{
+    using namespace ck;
+
+    constexpr auto col_eb_desc   = ColDesc{};
+    constexpr auto img_nchw_desc = ImgDesc{};
+
+    constexpr index_t N  = img_nchw_desc.GetLengths()[0];
+    constexpr index_t C  = img_nchw_desc.GetLengths()[1];
+    constexpr index_t Hi = img_nchw_desc.GetLengths()[2];
+    constexpr index_t Wi = img_nchw_desc.GetLengths()[3];
+
+    constexpr index_t E = col_eb_desc.GetLengths()[0];
+    constexpr index_t B = col_eb_desc.GetLengths()[1];
+
+    std::size_t data_sz = sizeof(T);
+    DeviceMem col_eb_device_buf(data_sz * col_eb.mDesc.GetElementSpace());
+    DeviceMem img_nchw_device_buf(data_sz * img_nchw.mDesc.GetElementSpace());
+
+    col_eb_device_buf.ToDevice(col_eb.mData.data());
+    img_nchw_device_buf.ToDevice(img_nchw.mData.data());
+
+#if 1
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t EPerBlock = 128;
+    constexpr index_t BPerBlock = 128;
+
+    using BlockCopySubLengths_E_B            = Sequence<8, 8>;
+    using BlockCopyClusterLengths_E_B        = Sequence<16, 16>;
+    using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+    using BlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
+    using BlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]
+
+    constexpr index_t BlockCopyDataPerAccess_B = 1;
+#endif
+
+    constexpr index_t GridSize =
+        ((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);
+
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+
+    constexpr auto gridwise_col2im = GridwiseCol2Im_eb_nchw<GridSize,
+                                                            BlockSize,
+                                                            T,
+                                                            ColDesc,
+                                                            ImgDesc,
+                                                            FilterSizes,
+                                                            OutputSizes,
+                                                            ConvStrides,
+                                                            ConvDilations,
+                                                            LeftPads,
+                                                            RightPads,
+                                                            EPerBlock,
+                                                            BPerBlock,
+                                                            BlockCopySubLengths_E_B,
+                                                            BlockCopyClusterLengths_E_B,
+                                                            BlockCopyThreadClusterArrangeOrder,
+                                                            BlockCopySrcAccessOrder,
+                                                            BlockCopyDstAccessOrder,
+                                                            BlockCopyDataPerAccess_B>{};
+
+    for(index_t i = 0; i < nrepeat; ++i)
+    {
+        float time = launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
+                                                          const T* const __restrict__,
+                                                          T* const __restrict__>,
+                                   dim3(GridSize),
+                                   dim3(BlockSize),
+                                   0,
+                                   gridwise_col2im,
+                                   const_cast<const T* const __restrict__>(
+                                       static_cast<T*>(col_eb_device_buf.GetDeviceBuffer())),
+                                   const_cast<T* const __restrict__>(
+                                       static_cast<T*>(img_nchw_device_buf.GetDeviceBuffer())));
+
+        printf("Elapsed time : %f ms\n", time);
+        usleep(std::min(time * 1000, float(10000)));
+    }
+
+    img_nchw_device_buf.FromDevice(img_nchw.mData.data());
+}
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -2,7 +2,7 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "tensor.hpp"
-#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_operation_wrapper.hpp"
 #include "convolution_common.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"

@@ -221,13 +221,20 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,

    for(index_t i = 0; i < nrepeat; ++i)
    {
-        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
+        float time = launch_kernel(run_gridwise_operation<decltype(gridwise_conv),
+                                                          const T* const __restrict__,
+                                                          const T* const __restrict__,
+                                                          T* const __restrict__>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
-                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
-                                   static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
-                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
+                                   gridwise_conv,
+                                   const_cast<const T* const __restrict__>(
+                                       static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
+                                   const_cast<const T* const __restrict__>(
+                                       static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
+                                   const_cast<T* const __restrict__>(
+                                       static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));

        printf("Elapsed time : %f ms, %f TFlop/s\n",
               time,

--- a/driver/include/host_col2im.hpp
+++ b/driver/include/host_col2im.hpp
@@ -37,7 +37,7 @@ void host_col2im(const Tensor<T>& in_eb,
        {
            int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];

-            if(h_tmp % ConvStrides{}[0] == 0)
+            if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0)
            {
                int ho = h_tmp / ConvStrides{}[0];

@@ -45,7 +45,7 @@ void host_col2im(const Tensor<T>& in_eb,
                {
                    int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];

-                    if(w_tmp % ConvStrides{}[1] == 0)
+                    if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0)
                    {
                        int wo = w_tmp / ConvStrides{}[1];


--- a/driver/include/tensor_generator.hpp
+++ b/driver/include/tensor_generator.hpp
@@ -5,10 +5,12 @@

 struct GeneratorTensor_1
 {
+    int value = 1;
+
    template <class... Is>
    double operator()(Is... is)
    {
-        return 1;
+        return value;
    }
 };


--- a/driver/src/col2im_driver.cpp
+++ b/driver/src/col2im_driver.cpp
@@ -13,26 +13,26 @@
 #include "device_tensor.hpp"
 #include "conv_common.hpp"
 #include "host_col2im.hpp"
-//#include "device_col2im.hpp"
+#include "device_col2im_eb_nchw.hpp"

 int main(int argc, char* argv[])
 {
    using namespace ck;

 #if 1
-    constexpr index_t N  = 1;
-    constexpr index_t C  = 1;
-    constexpr index_t HI = 17;
-    constexpr index_t WI = 17;
-    constexpr index_t K  = 1;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
+    constexpr index_t N  = 2;
+    constexpr index_t C  = 8;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 4;
+    constexpr index_t X  = 4;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<1, 1>;
-    using RightPads = Sequence<1, 1>;
+    using RightPads = Sequence<2, 2>;
 #elif 0
    // 3x3, 34x34
    constexpr index_t N  = 64;
@@ -303,21 +303,22 @@ int main(int argc, char* argv[])
    using RightPads = Sequence<0, 3>;
 #endif

-    constexpr auto in_nchw_desc  = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
+    constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
    constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
-        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
+        img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

    constexpr index_t HO = out_nkhw_desc.GetLengths()[2];
    constexpr index_t WO = out_nkhw_desc.GetLengths()[3];

-    auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});
+    constexpr auto col_eb_desc =
+        make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});

    using FilterSizes = Sequence<Y, X>;
    using OutputSizes = Sequence<HO, WO>;

-    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
-    ostream_ConstantTensorDescriptor(in_eb_desc, std::cout << "in_eb_desc: ");
+    ostream_ConstantTensorDescriptor(col_eb_desc, std::cout << "col_eb_desc: ");
+    ostream_ConstantTensorDescriptor(img_nchw_desc, std::cout << "img_nchw_desc: ");
    print_sequence("FilterSizes", FilterSizes{});
    print_sequence("OutputSizes", OutputSizes{});
    print_sequence("LeftPads", LeftPads{});
@@ -326,9 +327,9 @@ int main(int argc, char* argv[])
    print_sequence("ConvStrides", ConvStrides{});
    print_sequence("ConvDilations", ConvDilations{});

-    Tensor<float> in_eb(make_TensorDescriptor(in_eb_desc));
-    Tensor<float> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
-    Tensor<float> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
+    Tensor<float> col_eb(make_TensorDescriptor(col_eb_desc));
+    Tensor<float> img_nchw_host(make_TensorDescriptor(img_nchw_desc));
+    Tensor<float> img_nchw_device(make_TensorDescriptor(img_nchw_desc));

    std::size_t num_thread = std::thread::hardware_concurrency();

@@ -339,35 +340,35 @@ int main(int argc, char* argv[])
    }

    bool do_verification = atoi(argv[1]);
-    index_t nrepeat      = atoi(argv[2]);
+    std::size_t nrepeat  = atoi(argv[2]);

    if(do_verification)
    {
 #if 1
-        in_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #else
-        in_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #endif
    }

-#if 0
-    device_col2im(in_eb_desc,
-                  in_eb,
-                  in_nchw_desc,
-                  in_nchw_device,
-                  FilterSizes{},
-                  OutputSizes{},
-                  ConvStrides{},
-                  ConvDilations{},
-                  LeftPads{},
-                  RightPads{},
-                  nrepeat);
+#if 1
+    device_col2im_eb_nchw(col_eb_desc,
+                          col_eb,
+                          img_nchw_desc,
+                          img_nchw_device,
+                          FilterSizes{},
+                          OutputSizes{},
+                          ConvStrides{},
+                          ConvDilations{},
+                          LeftPads{},
+                          RightPads{},
+                          nrepeat);
 #endif

    if(do_verification)
    {
-        host_col2im(in_eb,
-                    in_nchw_host,
+        host_col2im(col_eb,
+                    img_nchw_host,
                    FilterSizes{},
                    OutputSizes{},
                    ConvStrides{},
@@ -375,12 +376,12 @@ int main(int argc, char* argv[])
                    LeftPads{},
                    RightPads{});

-        check_error(in_nchw_host, in_nchw_device);
+        check_error(img_nchw_host, img_nchw_device);

 #if 1
-        LogRange(std::cout << "in_eb : ", in_eb.mData, ",") << std::endl;
-        LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl;
-        LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl;
+        LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
+        LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
+        LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
 #endif
    }
 }
--- a/driver/src/col2im_driver.cu
+++ b/driver/src/col2im_driver.cu
@@ -13,26 +13,26 @@
 #include "device_tensor.hpp"
 #include "conv_common.hpp"
 #include "host_col2im.hpp"
-//#include "device_col2im.hpp"
+#include "device_col2im_eb_nchw.hpp"

 int main(int argc, char* argv[])
 {
    using namespace ck;

 #if 1
-    constexpr index_t N  = 1;
-    constexpr index_t C  = 1;
-    constexpr index_t HI = 17;
-    constexpr index_t WI = 17;
-    constexpr index_t K  = 1;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
+    constexpr index_t N  = 2;
+    constexpr index_t C  = 8;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 4;
+    constexpr index_t X  = 4;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<1, 1>;
-    using RightPads = Sequence<1, 1>;
+    using RightPads = Sequence<2, 2>;
 #elif 0
    // 3x3, 34x34
    constexpr index_t N  = 64;
@@ -303,21 +303,22 @@ int main(int argc, char* argv[])
    using RightPads = Sequence<0, 3>;
 #endif

-    constexpr auto in_nchw_desc  = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
+    constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
    constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
-        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
+        img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

    constexpr index_t HO = out_nkhw_desc.GetLengths()[2];
    constexpr index_t WO = out_nkhw_desc.GetLengths()[3];

-    auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});
+    constexpr auto col_eb_desc =
+        make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});

    using FilterSizes = Sequence<Y, X>;
    using OutputSizes = Sequence<HO, WO>;

-    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
-    ostream_ConstantTensorDescriptor(in_eb_desc, std::cout << "in_eb_desc: ");
+    ostream_ConstantTensorDescriptor(col_eb_desc, std::cout << "col_eb_desc: ");
+    ostream_ConstantTensorDescriptor(img_nchw_desc, std::cout << "img_nchw_desc: ");
    print_sequence("FilterSizes", FilterSizes{});
    print_sequence("OutputSizes", OutputSizes{});
    print_sequence("LeftPads", LeftPads{});
@@ -326,9 +327,9 @@ int main(int argc, char* argv[])
    print_sequence("ConvStrides", ConvStrides{});
    print_sequence("ConvDilations", ConvDilations{});

-    Tensor<float> in_eb(make_TensorDescriptor(in_eb_desc));
-    Tensor<float> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
-    Tensor<float> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
+    Tensor<float> col_eb(make_TensorDescriptor(col_eb_desc));
+    Tensor<float> img_nchw_host(make_TensorDescriptor(img_nchw_desc));
+    Tensor<float> img_nchw_device(make_TensorDescriptor(img_nchw_desc));

    std::size_t num_thread = std::thread::hardware_concurrency();

@@ -339,35 +340,37 @@ int main(int argc, char* argv[])
    }

    bool do_verification = atoi(argv[1]);
-    index_t nrepeat      = atoi(argv[2]);
+    std::size_t nrepeat  = atoi(argv[2]);

    if(do_verification)
    {
-#if 1
-        in_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+#if 0
+        col_eb.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
+        img_nchw_device.GenerateTensorValue(GeneratorTensor_1{0}, num_thread);
 #else
-        in_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        img_nchw_device.GenerateTensorValue(GeneratorTensor_1{0}, num_thread);
 #endif
    }

-#if 0
-    device_col2im(in_eb_desc,
-                  in_eb,
-                  in_nchw_desc,
-                  in_nchw_device,
-                  FilterSizes{},
-                  OutputSizes{},
-                  ConvStrides{},
-                  ConvDilations{},
-                  LeftPads{},
-                  RightPads{},
-                  nrepeat);
+#if 1
+    device_col2im_eb_nchw(col_eb_desc,
+                          col_eb,
+                          img_nchw_desc,
+                          img_nchw_device,
+                          FilterSizes{},
+                          OutputSizes{},
+                          ConvStrides{},
+                          ConvDilations{},
+                          LeftPads{},
+                          RightPads{},
+                          nrepeat);
 #endif

    if(do_verification)
    {
-        host_col2im(in_eb,
-                    in_nchw_host,
+        host_col2im(col_eb,
+                    img_nchw_host,
                    FilterSizes{},
                    OutputSizes{},
                    ConvStrides{},
@@ -375,12 +378,12 @@ int main(int argc, char* argv[])
                    LeftPads{},
                    RightPads{});

-        check_error(in_nchw_host, in_nchw_device);
+        check_error(img_nchw_host, img_nchw_device);

-#if 1
-        LogRange(std::cout << "in_eb : ", in_eb.mData, ",") << std::endl;
-        LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl;
-        LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl;
+#if 0
+        LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
+        LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
+        LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
 #endif
    }
 }
--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -29,7 +29,7 @@ int main(int argc, char* argv[])
 {
    using namespace ck;

-#if 1
+#if 0
    constexpr index_t N  = 8;
    constexpr index_t C  = 32;
    constexpr index_t HI = 28;
@@ -393,7 +393,7 @@ int main(int argc, char* argv[])
 #elif 0
    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
+#elif 0
    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc,
                                                                    in_nchw,
                                                                    wei_kcyx_desc,
@@ -403,7 +403,7 @@ int main(int argc, char* argv[])
                                                                    ConvStrides{},
                                                                    ConvDilations{},
                                                                    nrepeat);
-#elif 1
+#elif 0
    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,
@@ -445,7 +445,7 @@ int main(int argc, char* argv[])
                                                                    ConvStrides{},
                                                                    ConvDilations{},
                                                                    nrepeat);
-#elif 0
+#elif 1
    device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,