Unverified commit 8f5f6496 authored by Chao Liu, committed by GitHub

backward data (#7)

* enabled atomic add in tensor copy
* added gridwise GEMM
* added backward data conv using GEMM + atomic
* added backward data conv using GEMM, no atomic
parent 31ded4ac
@@ -33,7 +33,15 @@ namespace ck {
 enum AddressSpace
 {
     generic,
-    global = generic
+    global,
+    lds,
+    vgpr
+};
+
+enum InMemoryDataOperation
+{
+    none,
+    atomic_add
 };

 #if CK_UNSIGNED_INDEX_TYPE
......
@@ -64,9 +64,8 @@ struct static_if<true>
     }

     template <typename F>
-    __host__ __device__ static constexpr auto Else(F)
+    __host__ __device__ static void Else(F)
     {
-        return Type{};
     }
 };
@@ -82,14 +81,13 @@ struct static_if<false>
     }

     template <typename F>
-    __host__ __device__ static constexpr auto Else(F f)
+    __host__ __device__ static void Else(F f)
     {
         // This is a trick for compiler:
         // Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it,
         // this will make "f" a generic lambda, so that "f" won't be compiled until being
         // instantiated here
         f(forwarder{});
-        return Type{};
     }
 };
......
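A minimal usage sketch of the pattern above, for orientation only: it chains the call operator with .Else() exactly as the new in-memory-operation header below does. The function name, the assumption that static_if and AddressSpace live in namespace ck here, and the assumption that forwarder simply passes its argument through (suggested by the fwd(false) usage in that header) are mine, not part of the commit.

template <ck::AddressSpace Space, typename T>
__device__ void store_example(T* p_dst, ck::index_t offset, T value)
{
    using namespace ck;

    static_if<Space == AddressSpace::global>{}([&](auto) {
        // this branch is instantiated only when Space == global
        p_dst[offset] = value;
    }).Else([&](auto fwd) {
        // "fwd" keeps this body dependent on a template parameter, so it is
        // not compiled unless the false branch is actually selected
        p_dst[fwd(offset)] = value;
    });
}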
#ifndef CK_IN_MEMORY_OPERATION_AMD_HPP
#define CK_IN_MEMORY_OPERATION_AMD_HPP
#include "float_type.hpp"
#include "amd_buffer_addressing.hpp"
namespace ck {
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
#if CK_USE_AMD_BUFFER_ADDRESSING
// TODO: use static_if::ElseIf, instead of nested static_if
static_if<SrcAddressSpace == AddressSpace::global && DstAddressSpace == vgpr>{}([&](auto) {
// buffer_load requires:
// 1) p_src must be in global memory space, p_dst must be in vgpr
// 2) p_src must be a block-invariant pointer.
// It is the user's responsibility to make sure both conditions hold.
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_intrinsic_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
}).Else([&](auto) {
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == global>{}([&](auto) {
// buffer_store requires:
// 1) p_src must be in vgpr space, p_dst must be in global memory
// 2) p_dst must be a block-invariant pointer.
// It is the user's responsibility to make sure both conditions hold.
amd_intrinsic_buffer_store<T, DataPerAccess>(
*reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
}).Else([&](auto) {
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
});
});
#else
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
#endif
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
[&](auto) {
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
})
.Else([&](auto fwd) {
static_assert(fwd(false), "atomic_add doesn't support this memory space");
});
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp>
__device__ void move_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
static_assert(DstInMemOp == InMemoryDataOperation::none ||
DstInMemOp == InMemoryDataOperation::atomic_add,
"wrong! InMemoryDataOperation not supported!");
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::none>{}([&](auto) {
copy_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
static_if<DstInMemOp == InMemoryDataOperation::atomic_add>{}([&](auto) {
atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
}
} // namespace ck
#endif
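A hedged sketch of how the new move_data entry point might be used from device code to accumulate a register value into global memory. Only move_data, AddressSpace and InMemoryDataOperation come from the header above; the helper name, the float element type and the offsets are illustrative assumptions.

__device__ void accumulate_one(float* p_global, ck::index_t dst_offset, float value)
{
    using namespace ck;

    float r[1] = {value}; // the source value lives in registers (vgpr)

    // InMemoryDataOperation::atomic_add selects atomic_add_data, which is what
    // the "GEMM + atomic" backward-data path in this commit relies on when
    // several threads contribute to the same input pixel
    move_data<float, 1, AddressSpace::vgpr, AddressSpace::global,
              InMemoryDataOperation::atomic_add>(r, 0, p_global, dst_offset);
}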
#ifndef CK_IN_MEMORY_OPERATION_NVIDIA_HPP
#define CK_IN_MEMORY_OPERATION_NVIDIA_HPP
namespace ck {
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
[&](auto) {
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
})
.Else([&](auto fwd) {
static_assert(fwd(false), "atomic_add doesn't support this memory space");
});
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp>
__device__ void move_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
static_assert(DstInMemOp == InMemoryDataOperation::none ||
DstInMemOp == InMemoryDataOperation::atomic_add,
"wrong! InMemoryDataOperation not supported!");
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::none>{}([&](auto) {
copy_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
static_if<DstInMemOp == InMemoryDataOperation::atomic_add>{}([&](auto) {
atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
}
} // namespace ck
#endif
@@ -97,12 +97,57 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
     return x < y ? x : y;
 }

-// this is WRONG
-// TODO: implement least common multiple properly, instead of calling max()
-template <class T, class... Ts>
-__host__ __device__ constexpr T lcm(T x, Ts... xs)
+// highest common factor
+template <typename T>
+__host__ __device__ constexpr T hcf(T x, T y)
+{
+    if(x == 0)
+    {
+        return y;
+    }
+
+    if(y == 0)
+    {
+        return x;
+    }
+
+    if(x == y)
+    {
+        return x;
+    }
+
+    if(x > y)
+    {
+        return hcf(x - y, y);
+    }
+
+    return hcf(x, y - x);
+}
+
+template <index_t X, index_t Y>
+__host__ __device__ constexpr auto hcf(Number<X>, Number<Y>)
+{
+    constexpr auto result = hcf(X, Y);
+    return Number<result>{};
+}
+
+template <typename X, typename... Ys>
+__host__ __device__ constexpr auto hcf(X x, Ys... ys)
+{
+    return hcf(x, ys...);
+}
+
+// least common multiple
+template <typename T>
+__host__ __device__ constexpr T lcm(T x, T y)
+{
+    return (x * y) / hcf(x, y);
+}
+
+template <typename X, typename Y, typename... Zs>
+__host__ __device__ constexpr auto lcm(X x, Y y, Zs... zs)
 {
-    return max(x, xs...);
+    return lcm(x, lcm(y, zs...));
 }

 template <class T>
......
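A quick illustration of the new helpers (the values and the static_asserts are mine, not part of the commit): hcf is the subtraction-based GCD, so hcf(6, 4) -> hcf(2, 4) -> hcf(2, 2) -> 2, and lcm(6, 4) = (6 * 4) / hcf(6, 4) = 12. The backward-data driver later in this commit calls hcf(ConvStrideH, ConvDilationH) in exactly this way.

// illustrative compile-time check, in any TU that includes the math header
static_assert(ck::math::hcf(6, 4) == 2, "subtraction-based GCD");
static_assert(ck::math::lcm(6, 4) == 12, "lcm computed via hcf");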
@@ -15,10 +15,18 @@ install(TARGETS host LIBRARY DESTINATION lib)
 if(DEVICE_BACKEND STREQUAL "AMD")
-    set(DRIVER_SOURCE src/driver.cpp)
+    set(CONV_SOURCE src/conv_driver.cpp)
+    set(COL2IM_SOURCE src/col2im_driver.cpp)
+    set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp)
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
-    set(DRIVER_SOURCE src/driver.cu)
+    set(CONV_SOURCE src/conv_driver.cu)
+    set(COL2IM_SOURCE src/col2im_driver.cu)
+    set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu)
 endif()

-add_executable(driver ${DRIVER_SOURCE})
-target_link_libraries(driver PRIVATE host)
+add_executable(conv ${CONV_SOURCE})
+add_executable(col2im ${COL2IM_SOURCE})
+add_executable(conv_bwd_data ${CONV_BWD_DATA_SOURCE})
+
+target_link_libraries(conv PRIVATE host)
+target_link_libraries(col2im PRIVATE host)
+target_link_libraries(conv_bwd_data PRIVATE host)
@@ -2,10 +2,16 @@
 #define CONV_COMMON_HPP

 #include "ConstantTensorDescriptor_deprecated.hpp"
+#include "tensor_descriptor.hpp"

-// this is ugly, only for 4d
-template <class InDesc, class WeiDesc>
-constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
+template <class InDesc,
+          class WeiDesc,
+          class ConvStrides,
+          class ConvDilations,
+          class LowerPads,
+          class UpperPads>
+constexpr auto get_convolution_output_default_4d_tensor_descriptor_deprecated(
+    InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads)
 {
     using namespace ck;
@@ -22,18 +28,27 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDe
     static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
                   "input & weight dimension not consistent");

-    constexpr auto N  = in_desc.GetLength(I0);
-    constexpr auto HI = in_desc.GetLength(I2);
-    constexpr auto WI = in_desc.GetLength(I3);
+    constexpr index_t N  = in_desc.GetLength(I0);
+    constexpr index_t Hi = in_desc.GetLength(I2);
+    constexpr index_t Wi = in_desc.GetLength(I3);

-    constexpr auto K = wei_desc.GetLength(I0);
-    constexpr auto Y = wei_desc.GetLength(I2);
-    constexpr auto X = wei_desc.GetLength(I3);
+    constexpr index_t K = wei_desc.GetLength(I0);
+    constexpr index_t Y = wei_desc.GetLength(I2);
+    constexpr index_t X = wei_desc.GetLength(I3);
+
+    constexpr index_t HPadLow = LowerPads{}.Get(I0);
+    constexpr index_t WPadLow = LowerPads{}.Get(I1);
+
+    constexpr index_t HPadUp = UpperPads{}.Get(I0);
+    constexpr index_t WPadUp = UpperPads{}.Get(I1);
+
+    constexpr index_t YEff = (Y - 1) * ConvDilations{}[0] + 1;
+    constexpr index_t XEff = (X - 1) * ConvDilations{}[1] + 1;

-    constexpr auto HO = HI + 1 - Y;
-    constexpr auto WO = WI + 1 - X;
+    constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1;
+    constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1;

-    return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
+    return make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
 }

 template <class InDesc,
@@ -42,7 +57,7 @@ template <class InDesc,
           class ConvDilations,
           class LowerPads,
           class UpperPads>
-constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
+constexpr auto get_convolution_output_default_4d_tensor_descriptor(
     InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads)
 {
     using namespace ck;
@@ -80,7 +95,7 @@ constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
     constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1;
     constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1;

-    return make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
+    return make_native_tensor_descriptor_packed(Sequence<N, K, Ho, Wo>{});
 }

 template <class InDesc, class WeiDesc, class OutDesc>
......
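A sanity check of the padded/dilated output-size formula above, with illustrative numbers that are not taken from the commit: Hi = 28, Y = 3, dilation 2, stride 2, and padding 1 on both sides.

// YEff = (3 - 1) * 2 + 1 = 5
// Ho   = (28 + 1 + 1 - 5) / 2 + 1 = 13   (integer division)
constexpr int YEff_example = (3 - 1) * 2 + 1;
constexpr int Ho_example   = (28 + 1 + 1 - 5) / 2 + 1;
static_assert(YEff_example == 5 && Ho_example == 13, "worked example of the output-size formula");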
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_col2im_eb_nchw.hpp"
template <typename T,
typename ColDesc,
typename ImgDesc,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_col2im_eb_nchw(ColDesc,
const Tensor<T>& col_eb,
ImgDesc,
Tensor<T>& img_nchw,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr auto col_eb_desc = ColDesc{};
constexpr auto img_nchw_desc = ImgDesc{};
constexpr index_t N = img_nchw_desc.GetLengths()[0];
constexpr index_t C = img_nchw_desc.GetLengths()[1];
constexpr index_t Hi = img_nchw_desc.GetLengths()[2];
constexpr index_t Wi = img_nchw_desc.GetLengths()[3];
constexpr index_t E = col_eb_desc.GetLengths()[0];
constexpr index_t B = col_eb_desc.GetLengths()[1];
std::size_t data_sz = sizeof(T);
DeviceMem col_eb_device_buf(data_sz * col_eb.mDesc.GetElementSpace());
DeviceMem img_nchw_device_buf(data_sz * img_nchw.mDesc.GetElementSpace());
col_eb_device_buf.ToDevice(col_eb.mData.data());
img_nchw_device_buf.ToDevice(img_nchw.mData.data());
#if 1
constexpr index_t BlockSize = 256;
constexpr index_t EPerBlock = 128;
constexpr index_t BPerBlock = 128;
using BlockCopySubLengths_E_B = Sequence<8, 8>;
using BlockCopyClusterLengths_E_B = Sequence<16, 16>;
using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
using BlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
using BlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
constexpr index_t BlockCopyDataPerAccess_B = 1;
#endif
constexpr index_t GridSize =
((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_col2im = GridwiseCol2Im_eb_nchw<GridSize,
BlockSize,
T,
ColDesc,
ImgDesc,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
EPerBlock,
BPerBlock,
BlockCopySubLengths_E_B,
BlockCopyClusterLengths_E_B,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
BlockCopyDataPerAccess_B>{};
for(index_t i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
const T* const __restrict__,
T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
gridwise_col2im,
const_cast<const T* const __restrict__>(
static_cast<T*>(col_eb_device_buf.GetDeviceBuffer())),
const_cast<T* const __restrict__>(
static_cast<T*>(img_nchw_device_buf.GetDeviceBuffer())));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
img_nchw_device_buf.FromDevice(img_nchw.mData.data());
}
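For orientation, an illustrative sizing of this col2im driver (the numbers are assumptions, not from the commit), using the E = C * Y * X and B = N * Ho * Wo layout that the host col2im reference later in this commit implies:

// N = 64, C = 256, 3x3 filter, 14x14 output (illustrative values only)
constexpr int E_example = 256 * 3 * 3;   // 2304 rows of the column matrix
constexpr int B_example = 64 * 14 * 14;  // 12544 columns of the column matrix
constexpr int Grid_example =
    ((E_example + 127) / 128) * ((B_example + 127) / 128); // 18 * 98 = 1764 workgroups of 256 threads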
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp"
template <typename T,
typename InDesc,
typename WeiDesc,
typename OutDesc,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc in_nchw_desc,
Tensor<T>& in_nchw,
WeiDesc wei_kcyx_desc,
const Tensor<T>& wei_kcyx,
OutDesc out_nkhw_desc,
const Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr index_t N = out_nkhw_desc.GetLengths()[0];
constexpr index_t K = out_nkhw_desc.GetLengths()[1];
constexpr index_t Ho = out_nkhw_desc.GetLengths()[2];
constexpr index_t Wo = out_nkhw_desc.GetLengths()[3];
constexpr index_t C = wei_kcyx_desc.GetLengths()[1];
constexpr index_t Y = wei_kcyx_desc.GetLengths()[2];
constexpr index_t X = wei_kcyx_desc.GetLengths()[3];
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1
// BlockSize = 256, each thread holds 64 values
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadGemmDataPerReadM = 4;
constexpr index_t GemmThreadGemmDataPerReadN = 4;
using GemmABlockCopySubLengths = Sequence<1, 4>; // Gemm-K, Gemm-M
using GemmABlockCopyClusterLengths = Sequence<8, 32>; // Gemm-K, Gemm-M
constexpr index_t GemmABlockCopyDataPerAccess = 4; // Gemm-M
using GemmBBlockCopySubLengths = Sequence<4, 1>; // Gemm-K, Gemm-N
using GemmBBlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-N
constexpr index_t GemmBBlockCopyDataPerAccess = 1; // Gemm-N
constexpr index_t GemmCThreadCopyDataPerAccess = 1; // Gemm-N
#endif
constexpr index_t GemmM = C * Y * X;
constexpr index_t GemmN = N * Ho * Wo;
constexpr index_t GridSize = ((GemmM + GemmMPerBlock - 1) / GemmMPerBlock) *
((GemmN + GemmNPerBlock - 1) / GemmNPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv = GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw<
GridSize,
BlockSize,
T,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmThreadGemmDataPerReadM,
GemmThreadGemmDataPerReadN,
GemmABlockCopySubLengths,
GemmABlockCopyClusterLengths,
GemmABlockCopyDataPerAccess,
GemmBBlockCopySubLengths,
GemmBBlockCopyClusterLengths,
GemmBBlockCopyDataPerAccess,
GemmCThreadCopyDataPerAccess>{};
for(index_t i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(run_gridwise_operation<decltype(gridwise_conv),
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
gridwise_conv,
const_cast<T* const __restrict__>(
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
const_cast<const T* const __restrict__>(
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
const_cast<const T* const __restrict__>(
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
}
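The GEMM dimensions defined just above are GemmM = C * Y * X and GemmN = N * Ho * Wo. A hedged numeric example (sizes chosen for illustration, not taken from the commit):

// C = 128, Y = X = 3, N = 32, Ho = Wo = 14:
//   GemmM    = 128 * 9  = 1152
//   GemmN    = 32 * 196 = 6272
//   GridSize = ceil(1152 / 128) * ceil(6272 / 128) = 9 * 49 = 441 workgroups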
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp"
template <typename T,
typename InDesc,
typename WeiDesc,
typename OutDesc,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw(InDesc in_nchw_desc,
Tensor<T>& in_nchw,
WeiDesc wei_kcyx_desc,
const Tensor<T>& wei_kcyx,
OutDesc out_nkhw_desc,
const Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr index_t N = out_nkhw_desc.GetLengths()[0];
constexpr index_t K = out_nkhw_desc.GetLengths()[1];
constexpr index_t Ho = out_nkhw_desc.GetLengths()[2];
constexpr index_t Wo = out_nkhw_desc.GetLengths()[3];
constexpr index_t C = wei_kcyx_desc.GetLengths()[1];
constexpr index_t Y = wei_kcyx_desc.GetLengths()[2];
constexpr index_t X = wei_kcyx_desc.GetLengths()[3];
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1
// BlockSize = 256, each thread holds 64 values
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 32;
constexpr index_t EPerBlock = 32;
constexpr index_t KPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using OutBlockCopySubLengths_K_B_N0 = Sequence<1, 1, 4>;
using OutBlockCopyClusterLengths_K_B_N0 = Sequence<8, 32, 1>;
constexpr index_t OutBlockCopySrcDataPerRead_B = 1;
constexpr index_t OutBlockCopyDstDataPerWrite_N0 = 4;
using WeiBlockCopySubLengths_K_E_C0 = Sequence<1, 4, 1>;
using WeiBlockCopyClusterLengths_K_E_C0 = Sequence<8, 8, 4>;
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_C0 = 1;
constexpr index_t InThreadCopyDstDataPerWrite_B = 1;
#endif
constexpr index_t C0 = GemmMPerThreadSubC;
constexpr index_t N0 = GemmNPerThreadSubC;
constexpr index_t C1 = C / C0;
constexpr index_t N1 = N / N0;
constexpr index_t E = C1 * Y * X;
constexpr index_t B = (N1 * Ho * Wo);
constexpr index_t GridSize =
((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv =
GridwiseConvolutionBackwardDataImplicitGemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer<
GridSize,
BlockSize,
T,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
EPerBlock,
BPerBlock,
KPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
OutBlockCopySubLengths_K_B_N0,
OutBlockCopyClusterLengths_K_B_N0,
OutBlockCopySrcDataPerRead_B,
OutBlockCopyDstDataPerWrite_N0,
WeiBlockCopySubLengths_K_E_C0,
WeiBlockCopyClusterLengths_K_E_C0,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_C0,
InThreadCopyDstDataPerWrite_B>{};
for(index_t i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(run_gridwise_operation<decltype(gridwise_conv),
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
gridwise_conv,
const_cast<T* const __restrict__>(
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
const_cast<const T* const __restrict__>(
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
const_cast<const T* const __restrict__>(
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
}
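In this v1r2 variant the GEMM dimensions are pre-split by the per-thread sub-tile sizes (C0 = GemmMPerThreadSubC, N0 = GemmNPerThreadSubC), which appears to assume C and N are divisible by those factors. A hedged numeric example with illustrative values:

// C = 256, N = 64, Y = X = 3, Ho = Wo = 14, C0 = N0 = 4:
//   C1 = C / C0 = 64,  N1 = N / N0 = 16
//   E  = C1 * Y * X   = 64 * 9   = 576
//   B  = N1 * Ho * Wo = 16 * 196 = 3136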
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp"
template <typename T,
typename InDesc,
typename WeiDesc,
typename OutDesc,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc in_nchw_desc,
Tensor<T>& in_nchw,
WeiDesc wei_kcyx_desc,
const Tensor<T>& wei_kcyx,
OutDesc out_nkhw_desc,
const Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr index_t N = out_nkhw_desc.GetLengths()[0];
constexpr index_t K = out_nkhw_desc.GetLengths()[1];
constexpr index_t Ho = out_nkhw_desc.GetLengths()[2];
constexpr index_t Wo = out_nkhw_desc.GetLengths()[3];
constexpr index_t C = wei_kcyx_desc.GetLengths()[1];
constexpr index_t Y = wei_kcyx_desc.GetLengths()[2];
constexpr index_t X = wei_kcyx_desc.GetLengths()[3];
constexpr index_t ConvStrideH = ConvStrides{}[0];
constexpr index_t ConvStrideW = ConvStrides{}[1];
constexpr index_t ConvDilationH = ConvDilations{}[0];
constexpr index_t ConvDilationW = ConvDilations{}[1];
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1
// BlockSize = 256, each thread holds 64 values
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadGemmDataPerReadM = 4;
constexpr index_t GemmThreadGemmDataPerReadN = 4;
using GemmABlockCopySubLengths = Sequence<4, 1>; // Gemm-K, Gemm-M
using GemmABlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-M
constexpr index_t GemmABlockCopyDataPerAccess = 1; // Gemm-M
using GemmBBlockCopySubLengths = Sequence<4, 1>; // Gemm-K, Gemm-N
using GemmBBlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-N
constexpr index_t GemmBBlockCopyDataPerAccess = 1; // Gemm-N
constexpr index_t GemmCThreadCopyDataPerAccess = 1; // Gemm-N
#elif 0
// BlockSize = 256, each thread holds 64 values
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadGemmDataPerReadM = 4;
constexpr index_t GemmThreadGemmDataPerReadN = 4;
using GemmABlockCopySubLengths = Sequence<1, 4>; // Gemm-K, Gemm-M
using GemmABlockCopyClusterLengths = Sequence<8, 32>; // Gemm-K, Gemm-M
constexpr index_t GemmABlockCopyDataPerAccess = 4; // Gemm-M
using GemmBBlockCopySubLengths = Sequence<4, 1>; // Gemm-K, Gemm-N
using GemmBBlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-N
constexpr index_t GemmBBlockCopyDataPerAccess = 1; // Gemm-N
constexpr index_t GemmCThreadCopyDataPerAccess = 1; // Gemm-N
#endif
// TODO: this algorithm supports arbitrary stride and dilation, but for now keep them
// fixed at 1 for simplicity
constexpr index_t hcf_stride_dilation_h = math::hcf(ConvStrideH, ConvDilationH);
constexpr index_t hcf_stride_dilation_w = math::hcf(ConvStrideW, ConvDilationW);
constexpr index_t Ytilda = ConvStrideH / hcf_stride_dilation_h; // may be wrong
constexpr index_t Xtilda = ConvStrideW / hcf_stride_dilation_w; // may be wrong
constexpr index_t Ydot = math::integer_divide_ceil(Y, Ytilda);
constexpr index_t Xdot = math::integer_divide_ceil(X, Xtilda);
constexpr index_t right_pad_ho = (ConvDilationH / hcf_stride_dilation_h) * (Y - Ytilda);
constexpr index_t right_pad_wo = (ConvDilationW / hcf_stride_dilation_w) * (X - Xtilda);
constexpr index_t Htilda = Ho + right_pad_ho;
constexpr index_t Wtilda = Wo + right_pad_wo;
constexpr index_t GemmK = K * Ydot * Xdot;
constexpr index_t GemmM = C * Ytilda * Xtilda;
constexpr index_t GemmN = N * Htilda * Wtilda;
constexpr index_t GridSize = ((GemmM + GemmMPerBlock - 1) / GemmMPerBlock) *
((GemmN + GemmNPerBlock - 1) / GemmNPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv = GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw<
GridSize,
BlockSize,
T,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmThreadGemmDataPerReadM,
GemmThreadGemmDataPerReadN,
GemmABlockCopySubLengths,
GemmABlockCopyClusterLengths,
GemmABlockCopyDataPerAccess,
GemmBBlockCopySubLengths,
GemmBBlockCopyClusterLengths,
GemmBBlockCopyDataPerAccess,
GemmCThreadCopyDataPerAccess>{};
for(index_t i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(run_gridwise_operation<decltype(gridwise_conv),
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
gridwise_conv,
const_cast<T* const __restrict__>(
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
const_cast<const T* const __restrict__>(
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
const_cast<const T* const __restrict__>(
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
}
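The tilda/dot splitting above divides the filter by the stride/dilation ratio before forming the GEMM. A worked example with assumed values (not from the commit):

// ConvStrideH = 2, ConvDilationH = 1, Y = 3, Ho = 28:
//   hcf(2, 1) = 1, so Ytilda = 2 / 1 = 2
//   Ydot = ceil(Y / Ytilda) = ceil(3 / 2) = 2
//   right_pad_ho = (ConvDilationH / 1) * (Y - Ytilda) = 1 * (3 - 2) = 1
//   Htilda = Ho + right_pad_ho = 29
// GemmK then picks up a factor Ydot (= 2) and GemmM a factor Ytilda (= 2).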
@@ -2,7 +2,7 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "tensor.hpp"
-#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_operation_wrapper.hpp"
 #include "convolution_common.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"
@@ -54,8 +54,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 1
-    // BlockSize = 256, each thread hold 64 data
+#if 0
+    // BlockSize = 256, EperBlock = 8, each thread hold 64 data
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 16;
@@ -89,6 +89,43 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+#elif 1
+    // BlockSize = 256, EPerBlock = 16, each thread hold 64 data
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 16;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 16;
+
+    constexpr index_t GemmNRepeat = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA = 4;
+    constexpr index_t GemmDataPerReadB = 4;
+
+    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 2, 1, 4>;
+    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<16, 1, 16, 1>;
+
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [E, B, N1, N2]
+    using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
+
+    constexpr index_t InBlockCopySrcDataPerRead_B = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+
+    using WeiBlockCopySubLengths_E_K = Sequence<4, 2>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
+
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
+
     constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #elif 0
@@ -221,13 +258,20 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
-        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
+        float time = launch_kernel(run_gridwise_operation<decltype(gridwise_conv),
+                                                          const T* const __restrict__,
+                                                          const T* const __restrict__,
+                                                          T* const __restrict__>,
                                    dim3(GridSize),
                                    dim3(BlockSize),
                                    0,
-                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
-                                   static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
-                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
+                                   gridwise_conv,
+                                   const_cast<const T* const __restrict__>(
+                                       static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
+                                   const_cast<const T* const __restrict__>(
+                                       static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
+                                   const_cast<T* const __restrict__>(
+                                       static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));

         printf("Elapsed time : %f ms, %f TFlop/s\n",
                time,
......
@@ -46,7 +46,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 1
+#if 0
     // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
     constexpr index_t BlockSize = 256;
@@ -120,7 +120,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
     constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

-#elif 1
+#elif 0
     // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data
     constexpr index_t BlockSize = 256;
@@ -157,6 +157,42 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
     constexpr index_t WeiBlockCopySrcDataPerRead_E = 2;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

+#elif 1
+    constexpr index_t BlockSize = 64;
+
+    constexpr index_t BPerBlock = 16;
+    constexpr index_t KPerBlock = 32;
+    constexpr index_t EPerBlock = 4;
+
+    constexpr index_t GemmNRepeat = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 1;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA = 4;
+    constexpr index_t GemmDataPerReadB = 4;
+
+    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 2, 1, 4>;
+    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<4, 1, 16, 1>;
+
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [E, B, N1, N2]
+    using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
+
+    constexpr index_t InBlockCopySrcDataPerRead_B = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+
+    using WeiBlockCopySubLengths_E_K = Sequence<1, 2>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<4, 16>;
+
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2;
+
 #endif

     constexpr index_t N1 = GemmNRepeat;
......
@@ -51,6 +51,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

 #if 1
+    // BlockSize = 256, EPerBlock = 8
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 128;
@@ -85,7 +86,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

     constexpr index_t OutThreadCopyDataPerAccess_B = 1;
-#elif 1
+#elif 0
+    // BlockSize = 256, EPerBlock = 8
     // 1x1 filter, 8x8 image
     constexpr index_t BlockSize = 256;
@@ -122,6 +124,43 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t OutThreadCopyDataPerAccess_B = 4;
 #elif 0
+    // BlockSize = 256, EPerBlock = 16
+    // 1x1 filter, 8x8 image
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 16;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA = 4;
+    constexpr index_t GemmDataPerReadB = 4;
+
+    using InBlockCopySubLengths_E_B = Sequence<2, 4>;
+    using InBlockCopyClusterLengths_E_B = Sequence<8, 32>;
+
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
+
+    constexpr index_t InBlockCopyDataPerAccess_B = 4;
+
+    using WeiBlockCopySubLengths_E_K = Sequence<4, 2>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
+
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_B = 4;
+#elif 1
     // 1x1 filter, 14x14 image
     constexpr index_t BlockSize = 256;
@@ -167,47 +206,43 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

     constexpr auto gridwise_conv =
-#if 0
-        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
-#else
-        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
-#endif
-        <GridSize,
-         BlockSize,
-         T,
-         decltype(in_nchw_desc),
-         decltype(wei_kcyx_desc),
-         decltype(out_nkhw_desc),
-         ConvStrides,
-         ConvDilations,
-         LeftPads,
-         RightPads,
-         BPerBlock,
-         KPerBlock,
-         EPerBlock,
-         GemmMPerThreadSubC,
-         GemmNPerThreadSubC,
-         GemmMLevel0Cluster,
-         GemmNLevel0Cluster,
-         GemmMLevel1Cluster,
-         GemmNLevel1Cluster,
-         GemmKPerThreadLoop,
-         GemmDataPerReadA,
-         GemmDataPerReadB,
-         InBlockCopySubLengths_E_B,
-         InBlockCopyClusterLengths_E_B,
-         InBlockCopyThreadClusterArrangeOrder,
-         InBlockCopySrcAccessOrder,
-         InBlockCopyDstAccessOrder,
-         InBlockCopyDataPerAccess_B,
-         WeiBlockCopySubLengths_E_K,
-         WeiBlockCopyClusterLengths_E_K,
-         WeiBlockCopyThreadClusterArrangeOrder,
-         WeiBlockCopySrcAccessOrder,
-         WeiBlockCopyDstAccessOrder,
-         WeiBlockCopySrcDataPerRead_E,
-         WeiBlockCopyDstDataPerWrite_K,
-         OutThreadCopyDataPerAccess_B>{};
+        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer<
+            GridSize,
+            BlockSize,
+            T,
+            decltype(in_nchw_desc),
+            decltype(wei_kcyx_desc),
+            decltype(out_nkhw_desc),
+            ConvStrides,
+            ConvDilations,
+            LeftPads,
+            RightPads,
+            BPerBlock,
+            KPerBlock,
+            EPerBlock,
+            GemmMPerThreadSubC,
+            GemmNPerThreadSubC,
+            GemmMLevel0Cluster,
+            GemmNLevel0Cluster,
+            GemmMLevel1Cluster,
+            GemmNLevel1Cluster,
+            GemmKPerThreadLoop,
+            GemmDataPerReadA,
+            GemmDataPerReadB,
+            InBlockCopySubLengths_E_B,
+            InBlockCopyClusterLengths_E_B,
+            InBlockCopyThreadClusterArrangeOrder,
+            InBlockCopySrcAccessOrder,
+            InBlockCopyDstAccessOrder,
+            InBlockCopyDataPerAccess_B,
+            WeiBlockCopySubLengths_E_K,
+            WeiBlockCopyClusterLengths_E_K,
+            WeiBlockCopyThreadClusterArrangeOrder,
+            WeiBlockCopySrcAccessOrder,
+            WeiBlockCopyDstAccessOrder,
+            WeiBlockCopySrcDataPerRead_E,
+            WeiBlockCopyDstDataPerWrite_K,
+            OutThreadCopyDataPerAccess_B>{};

     for(index_t i = 0; i < nrepeat; ++i)
     {
......
#pragma once
#include "tensor.hpp"
#include "common_header.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "tensor_descriptor.hpp"
template <typename ConstTensorDesc, std::size_t... Is>
auto make_TensorDescriptor_impl(ConstTensorDesc, std::integer_sequence<std::size_t, Is...>)
{
std::initializer_list<std::size_t> lengths = {ConstTensorDesc::GetLengths()[Is]...};
std::initializer_list<std::size_t> strides = {ConstTensorDesc::GetStrides()[Is]...};
return TensorDescriptor(lengths, strides);
}
template <typename ConstTensorDesc>
auto make_TensorDescriptor(ConstTensorDesc)
{
return make_TensorDescriptor_impl(
ConstTensorDesc{},
std::make_integer_sequence<std::size_t, ConstTensorDesc::GetNumOfDimension()>{});
}
template <typename ConstTensorDesc>
void ostream_ConstantTensorDescriptor(ConstTensorDesc, std::ostream& os = std::cout)
{
ostream_TensorDescriptor(make_TensorDescriptor(ConstTensorDesc{}), os);
}
#pragma once
#include "tensor.hpp"
template <typename T,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void host_col2im(const Tensor<T>& in_eb,
Tensor<T>& in_nchw,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads)
{
using namespace ck;
int N = in_nchw.mDesc.GetLengths()[0];
int C = in_nchw.mDesc.GetLengths()[1];
int HI = in_nchw.mDesc.GetLengths()[2];
int WI = in_nchw.mDesc.GetLengths()[3];
int Y = FilterSizes{}[0];
int X = FilterSizes{}[1];
int HO = OutputSizes{}[0];
int WO = OutputSizes{}[1];
auto f = [&](auto n, auto c, auto hi, auto wi) {
double v = 0;
for(int y = 0; y < Y; ++y)
{
int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];
if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0)
{
int ho = h_tmp / ConvStrides{}[0];
for(int x = 0; x < X; ++x)
{
int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0)
{
int wo = w_tmp / ConvStrides{}[1];
int e = c * (Y * X) + y * X + x;
int b = n * (HO * WO) + ho * WO + wo;
v += in_eb(e, b);
}
}
}
}
in_nchw(n, c, hi, wi) = v;
};
auto f_par = make_ParallelTensorFunctor(f,
in_nchw.mDesc.GetLengths()[0],
in_nchw.mDesc.GetLengths()[1],
in_nchw.mDesc.GetLengths()[2],
in_nchw.mDesc.GetLengths()[3]);
f_par(std::thread::hardware_concurrency());
}
 #pragma once
 #include "tensor.hpp"
-#include "common_header.hpp"
-#include "ConstantTensorDescriptor_deprecated.hpp"
-
-// this is ugly, only for 4d
-template <class TConstTensorDesc>
-void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::cout)
-{
-    using namespace ck;
-
-    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto desc = TConstTensorDesc{};
-
-    os << "Lengths: {" << desc.GetLength(I0) << ", " << desc.GetLength(I1) << ", "
-       << desc.GetLength(I2) << ", " << desc.GetLength(I3) << "}, "
-       << "Strides: {" << desc.GetStride(I0) << ", " << desc.GetStride(I1) << ", "
-       << desc.GetStride(I2) << ", " << desc.GetStride(I3) << "}" << std::endl;
-}
-
-// this is ugly, only for 4d
-template <class TConstTensorDesc>
-auto make_TensorDescriptor(TConstTensorDesc)
-{
-    using namespace ck;
-
-    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto desc = TConstTensorDesc{};
-
-    std::initializer_list<index_t> lengths = {
-        desc.GetLength(I0), desc.GetLength(I1), desc.GetLength(I2), desc.GetLength(I3)};
-    std::initializer_list<index_t> strides = {
-        desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3)};
-
-    return TensorDescriptor(lengths, strides);
-}
-
 template <class TIn,
           class TWei,
@@ -331,25 +287,3 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
     make_ParallelTensorFunctor(f_out_hold, N, K, HTile, WTile)(num_thread);
     make_ParallelTensorFunctor(f_out, N, K, HTile, WTile)(num_thread);
 }
-
-template <class T>
-void check_error(const Tensor<T>& ref, const Tensor<T>& result)
-{
-    float error = 0;
-    float max_diff = -1;
-    float ref_value = 0, result_value = 0;
-    for(int i = 0; i < ref.mData.size(); ++i)
-    {
-        error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        if(max_diff < diff)
-        {
-            max_diff = diff;
-            ref_value = ref.mData[i];
-            result_value = result.mData[i];
-        }
-    }
-
-    std::cout << "error: " << error << std::endl;
-    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
-}
#pragma once
#include "tensor.hpp"
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void host_direct_convolution_backward_data(Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx,
const Tensor<TOut>& out_nkhw,
ConvStrides,
ConvDilations,
LeftPads,
RightPads)
{
using namespace ck;
int N = in_nchw.mDesc.GetLengths()[0];
int C = in_nchw.mDesc.GetLengths()[1];
int HI = in_nchw.mDesc.GetLengths()[2];
int WI = in_nchw.mDesc.GetLengths()[3];
std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
std::size_t WO = out_nkhw.mDesc.GetLengths()[3];
auto f = [&](auto n, auto c, auto hi, auto wi) {
double v = 0;
for(int y = 0; y < Y; ++y)
{
int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];
if(h_tmp % ConvStrides{}[0] == 0)
{
int ho = h_tmp / ConvStrides{}[0];
if(ho >= 0 && ho < HO)
{
for(int x = 0; x < X; ++x)
{
int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
if(w_tmp % ConvStrides{}[1] == 0)
{
int wo = w_tmp / ConvStrides{}[1];
if(wo >= 0 && wo < WO)
{
for(int k = 0; k < K; ++k)
{
v += out_nkhw(n, k, ho, wo) * wei_kcyx(k, c, y, x);
}
}
}
}
}
}
}
in_nchw(n, c, hi, wi) = v;
};
auto f_par = make_ParallelTensorFunctor(f,
in_nchw.mDesc.GetLengths()[0],
in_nchw.mDesc.GetLengths()[1],
in_nchw.mDesc.GetLengths()[2],
in_nchw.mDesc.GetLengths()[3]);
f_par(std::thread::hardware_concurrency());
}
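To make the gather condition in the reference above concrete, a worked example with illustrative values (stride 2, dilation 1, left pad 1, a 5-tap filter, input row hi = 4), where h_tmp = hi + pad - y * dilation:

// y = 0 -> h_tmp = 5 (odd, skipped)     y = 1 -> h_tmp = 4 -> ho = 2
// y = 2 -> h_tmp = 3 (odd, skipped)     y = 3 -> h_tmp = 2 -> ho = 1
// y = 4 -> h_tmp = 1 (odd, skipped)
// so in_nchw(n, c, 4, wi) accumulates over k only at (ho, y) = (2, 1) and (1, 3).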
@@ -68,10 +68,12 @@ auto construct_f_unpack_args(F, T args)
 struct TensorDescriptor
 {
     TensorDescriptor() = delete;
-    TensorDescriptor(std::initializer_list<std::size_t> lens);
-    TensorDescriptor(std::initializer_list<std::size_t> lens,
-                     std::initializer_list<std::size_t> strides);
-    TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides);
+
+    template <typename X>
+    TensorDescriptor(std::vector<X> lens);
+
+    template <typename X, typename Y>
+    TensorDescriptor(std::vector<X> lens, std::vector<Y> strides);

     void CalculateStrides();
@@ -269,4 +271,39 @@ struct Tensor
     std::vector<T> mData;
 };

+void ostream_TensorDescriptor(const TensorDescriptor& desc, std::ostream& os = std::cout)
+{
+    os << "dim " << desc.GetNumOfDimension() << ", ";
+
+    os << "lengths {";
+    LogRange(os, desc.GetLengths(), ", ");
+    os << "}, ";
+
+    os << "strides {";
+    LogRange(os, desc.GetStrides(), ", ");
+    os << "}" << std::endl;
+}
+
+template <class T>
+void check_error(const Tensor<T>& ref, const Tensor<T>& result)
+{
+    float error = 0;
+    float max_diff = -1;
+    float ref_value = 0, result_value = 0;
+    for(int i = 0; i < ref.mData.size(); ++i)
+    {
+        error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
+        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
+        if(max_diff < diff)
+        {
+            max_diff = diff;
+            ref_value = ref.mData[i];
+            result_value = result.mData[i];
+        }
+    }
+
+    std::cout << "error: " << error << std::endl;
+    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
+}
+
 #endif
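A hedged usage sketch for the helpers moved into tensor.hpp above; the variable names are illustrative, not from the commit. After launching a kernel and computing a host reference, a driver would typically do something like:

// in_nchw_host:   host reference (e.g. from host_direct_convolution_backward_data)
// in_nchw_device: result copied back with FromDevice()
ostream_TensorDescriptor(make_TensorDescriptor(in_nchw_desc));
check_error(in_nchw_host, in_nchw_device);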
#ifndef TENSOR_GENERATOR_HPP
#define TENSOR_GENERATOR_HPP
#include "config.hpp"
struct GeneratorTensor_1
{
int value = 1;
template <class... Is>
double operator()(Is... is)
{
return value;
}
};
struct GeneratorTensor_2
{
int min_value = 0;
int max_value = 1;
template <class... Is>
double operator()(Is...)
{
return (std::rand() % (max_value - min_value)) + min_value;
}
};
struct GeneratorTensor_3
{
template <class... Is>
double operator()(Is... is)
{
std::array<ck::index_t, sizeof...(Is)> dims = {{static_cast<ck::index_t>(is)...}};
auto f_acc = [](auto a, auto b) { return 10 * a + b; };
return std::accumulate(dims.begin(), dims.end(), ck::index_t(0), f_acc);
}
};
struct GeneratorTensor_Checkboard
{
template <class... Ts>
double operator()(Ts... Xs) const
{
std::array<ck::index_t, sizeof...(Ts)> dims = {{Xs...}};
return std::accumulate(dims.begin(),
dims.end(),
true,
[](bool init, ck::index_t x) -> int { return init != (x % 2); })
? 1
: -1;
}
};
#endif
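For reference, the behaviour of the generators above, derived directly from their definitions (the argument values are illustrative):

// GeneratorTensor_3 concatenates the indices as decimal digits:
//   GeneratorTensor_3{}(1, 2, 3) == 123
// GeneratorTensor_2{-5, 5} draws integers in [-5, 5), and
// GeneratorTensor_Checkboard alternates +1 / -1 with the parity of the index sum.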