Unverified Commit 5c7cec11 authored by Chao Liu, committed by GitHub

Code clean up (#20)



* tuning parameters

* testing on v100

* add fp16

* remove deprecated tensor descriptor

* sync with miopen

* update build script
Co-authored-by: Jing Zhang <jizhan@amd.com>
parent 7d09790a
......@@ -25,11 +25,7 @@
#define CK_USE_AMD_BUFFER_ADDRESSING 1
#endif
#ifndef CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1
#endif
// only support gfx908
// only gfx908 supports native floating point atomic add
#ifndef CK_USE_AMD_BUFFER_ATOMIC_ADD
#define CK_USE_AMD_BUFFER_ATOMIC_ADD 0
#endif
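Every switch in this header is guarded by #ifndef, so a build can override it without editing the file. A minimal sketch of such an override (the chosen value is illustrative only; the header name follows the includes elsewhere in this diff):

// define the switch before the config header is included, e.g. in the driver
// translation unit or via -D on the compiler command line
#define CK_USE_AMD_BUFFER_ATOMIC_ADD 1 // only meaningful when targeting gfx908
#include "config.hpp"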
......@@ -47,6 +43,11 @@
#define CK_USE_AMD_XDLOPS_EMULATE 0 // For internal debug purposes
#endif
// block synchronization only waits on lgkmcnt(0) via s_waitcnt, not vmcnt(0)
#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
#endif
// experimental implementation
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
......@@ -54,8 +55,24 @@
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif
// workaround: put all workarounds here
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif
// workaround for buffer load/store fp16/bfp16 intrinsic bug
#ifndef CK_WORKAROUND_SWDEV_231101
#define CK_WORKAROUND_SWDEV_231101 1
#endif
namespace ck {
......
#ifndef CK_CONFIG_NVIDIA_HPP
#define CK_CONFIG_NVIDIA_HPP
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <nvToolsExt.h>
// index type: unsigned or signed
#define CK_UNSIGNED_INDEX_TYPE 0
......@@ -19,6 +18,7 @@
#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 0
#define CK_USE_AMD_XDLOPS 0
#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
#define CK_USE_AMD_XDLOPS_EMULATE 0
// experimental implementation
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 0
......@@ -32,16 +32,16 @@ namespace ck {
enum AddressSpace
{
generic,
global,
lds,
vgpr
Generic,
Global,
Lds,
Vgpr
};
enum InMemoryDataOperation
{
none,
atomic_add
Set,
AtomicAdd
};
#if CK_UNSIGNED_INDEX_TYPE
......
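The enumerators now use CamelCase, with Set replacing none and AtomicAdd replacing atomic_add; transfer_data (later in this diff) dispatches on them at compile time. A minimal sketch of that dispatch style (function name hypothetical):

// compile-time selection between a plain store and an atomic accumulate
template <InMemoryDataOperation Op>
__device__ void store_or_accumulate(float* p_dst, float v)
{
    static_if<Op == InMemoryDataOperation::Set>{}([&](auto) { *p_dst = v; });
    static_if<Op == InMemoryDataOperation::AtomicAdd>{}([&](auto) { atomicAdd(p_dst, v); });
}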
......@@ -11,12 +11,15 @@ typedef float float16_t __attribute__((ext_vector_type(16)));
typedef float float32_t __attribute__((ext_vector_type(32)));
// float16
typedef _Float16 half_t;
typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
typedef _Float16 half8_t __attribute__((ext_vector_type(8)));
// bfloat16
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
typedef ushort ushort8_t __attribute__((ext_vector_type(8)));
template <class T, index_t N>
struct vector_type
......@@ -83,37 +86,37 @@ struct vector_type<float, 4>
};
template <>
struct vector_type<half, 1>
struct vector_type<half_t, 1>
{
using MemoryType = half;
using MemoryType = half_t;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
__host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
*(reinterpret_cast<half_t*>(&v) + I) = s;
}
};
template <>
struct vector_type<half, 2>
struct vector_type<half_t, 2>
{
using MemoryType = half2_t;
union DataType
{
MemoryType vector;
half scalar[2];
half_t scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
__host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
*(reinterpret_cast<half_t*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1)
__host__ __device__ static MemoryType Pack(half_t s0, half_t s1)
{
DataType data;
data.scalar[0] = s0;
......@@ -123,24 +126,24 @@ struct vector_type<half, 2>
};
template <>
struct vector_type<half, 4>
struct vector_type<half_t, 4>
{
using MemoryType = half4_t;
union DataType
{
MemoryType vector;
half scalar[4];
half_t scalar[4];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
__host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
*(reinterpret_cast<half_t*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3)
__host__ __device__ static MemoryType Pack(half_t s0, half_t s1, half_t s2, half_t s3)
{
DataType data;
data.scalar[0] = s0;
......@@ -151,6 +154,25 @@ struct vector_type<half, 4>
}
};
template <>
struct vector_type<half_t, 8>
{
using MemoryType = half8_t;
union DataType
{
MemoryType vector;
half_t scalar[8];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
{
static_assert(I < 8, "wrong");
*(reinterpret_cast<half_t*>(&v) + I) = s;
}
};
template <>
struct vector_type<ushort, 1>
{
......@@ -220,6 +242,25 @@ struct vector_type<ushort, 4>
}
};
template <>
struct vector_type<ushort, 8>
{
using MemoryType = ushort8_t;
union DataType
{
MemoryType vector;
ushort scalar[8];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 8, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
};
// data type conversion
template <typename T>
struct type_convert
......@@ -250,12 +291,40 @@ struct inner_product_with_conversion
{
static constexpr auto convert = type_convert<T>();
__device__ T operator()(float4_t a, float4_t b) const
{
const float* p_a_float = reinterpret_cast<const float*>(&a);
const float* p_b_float = reinterpret_cast<const float*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_float[v]) * convert(p_b_float[v]);
}
return acc;
}
__device__ T operator()(float2_t a, float2_t b) const
{
const float* p_a_float = reinterpret_cast<const float*>(&a);
const float* p_b_float = reinterpret_cast<const float*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_float[v]) * convert(p_b_float[v]);
}
return acc;
}
__device__ T operator()(float a, float b) const { return convert(a) * convert(b); }
__device__ T operator()(half2_t a, half2_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
const half_t* p_a_half = reinterpret_cast<const half_t*>(&a);
const half_t* p_b_half = reinterpret_cast<const half_t*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
......@@ -268,8 +337,8 @@ struct inner_product_with_conversion
__device__ T operator()(half4_t a, half4_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
const half_t* p_a_half = reinterpret_cast<const half_t*>(&a);
const half_t* p_b_half = reinterpret_cast<const half_t*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
......@@ -279,6 +348,19 @@ struct inner_product_with_conversion
return acc;
}
__device__ T operator()(half8_t a, half8_t b) const
{
const half_t* p_a_half = reinterpret_cast<const half_t*>(&a);
const half_t* p_b_half = reinterpret_cast<const half_t*>(&b);
T acc = 0;
for(index_t v = 0; v < 8; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
__device__ T operator()(ushort2_t a, ushort2_t b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
......@@ -305,6 +387,19 @@ struct inner_product_with_conversion
}
return acc;
}
__device__ T operator()(ushort8_t a, ushort8_t b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
T acc = 0;
for(index_t v = 0; v < 8; ++v)
{
acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
}
return acc;
}
};
} // namespace ck
......
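With the 8-wide types, vector_type specializations, and operator() overloads added above, the fp16 path is 8-wide end to end. A minimal sketch of the accumulate-in-float idiom this enables (function name hypothetical):

// dot product of two 8-lane half vectors, converted lane-by-lane and
// accumulated in float, matching the overload above
__device__ float dot8(ck::half8_t a, ck::half8_t b)
{
    return ck::inner_product_with_conversion<float>{}(a, b);
}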
......@@ -13,8 +13,18 @@ namespace ck {
using float2_t = float2;
using float4_t = float4;
// float16
// float
typedef float float32_t __attribute__((ext_vector_type(32)));
// bfloat16
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
typedef ushort ushort8_t __attribute__((ext_vector_type(8)));
// fp16
using half_t = half;
using half2_t = half2;
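// note: CUDA has no native half4 vector type; float2 is an 8-byte stand-in of the same size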
using half4_t = float2;
template <class T, index_t N>
struct vector_type
......@@ -81,37 +91,37 @@ struct vector_type<float, 4>
};
template <>
struct vector_type<half, 1>
struct vector_type<half_t, 1>
{
using MemoryType = half;
using MemoryType = half_t;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
__host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
*(reinterpret_cast<half_t*>(&v) + I) = s;
}
};
template <>
struct vector_type<half, 2>
struct vector_type<half_t, 2>
{
using MemoryType = half2_t;
union DataType
{
MemoryType vector;
half scalar[2];
half_t scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
__host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
*(reinterpret_cast<half_t*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1)
__host__ __device__ static MemoryType Pack(half_t s0, half_t s1)
{
DataType data;
data.scalar[0] = s0;
......@@ -140,8 +150,8 @@ struct inner_product_with_conversion
__device__ T operator()(half2_t a, half2_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
const half_t* p_a_half = reinterpret_cast<const half_t*>(&a);
const half_t* p_b_half = reinterpret_cast<const half_t*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
......@@ -151,6 +161,19 @@ struct inner_product_with_conversion
return acc;
}
__device__ T operator()(half4_t a, half4_t b) const
{
const half_t* p_a_half = reinterpret_cast<const half_t*>(&a);
const half_t* p_b_half = reinterpret_cast<const half_t*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
};
} // namespace ck
......
......@@ -2,91 +2,159 @@
#define CK_IN_MEMORY_OPERATION_AMD_HPP
#include "float_type.hpp"
#if CK_USE_AMD_BUFFER_ADDRESSING
#include "amd_buffer_addressing.hpp"
#endif
namespace ck {
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void set_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
template <typename T>
__device__ void atomic_add_impl(T* p_dst, T src)
{
atomicAdd(p_dst, src);
}
// atomicAdd has no overload for float vector types, so decompose into scalar adds
template <>
__device__ void atomic_add_impl<float2_t>(float2_t* p_dst, float2_t src)
{
float* p_dst_float = reinterpret_cast<float*>(p_dst);
const float* p_src_float = reinterpret_cast<const float*>(&src);
for(index_t i = 0; i < 2; ++i)
{
atomicAdd(&(p_dst_float[i]), p_src_float[i]);
}
}
template <>
__device__ void atomic_add_impl<float4_t>(float4_t* p_dst, float4_t src)
{
float* p_dst_float = reinterpret_cast<float*>(p_dst);
const float* p_src_float = reinterpret_cast<const float*>(&src);
for(index_t i = 0; i < 4; ++i)
{
atomicAdd(&(p_dst_float[i]), p_src_float[i]);
}
}
template <typename T, index_t DataPerAccess>
struct SetData
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
// This generic version exists only for compatibility; avoid it when possible
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
__device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
}
#if CK_USE_AMD_BUFFER_ADDRESSING
// TODO: use static_if::ElseIf, instead of nested static_if
static_if<SrcAddressSpace == AddressSpace::Global &&
DstAddressSpace == AddressSpace::Vgpr>{}([&](auto) {
// buffer_load requires:
// 1) p_src must be in global memory space, d_dst must be vgpr
// 2) p_src to be a block-invariant pointer.
// It is user's responsibility to make sure that is true.
// buffer_load requires:
// 1) p_src must be in global memory space, p_dst must be vgpr
// 2) p_src must be a block-invariant pointer.
// It is the user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
index_t src_offset,
T* p_dst,
index_t dst_offset) const
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_intrinsic_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
}).Else([&](auto) {
static_if<SrcAddressSpace == AddressSpace::Vgpr &&
DstAddressSpace == AddressSpace::Global>{}([&](auto) {
// buffer_store requires:
// 1) p_src must be in vgpr space, d_dst must be global memory
// 2) p_dst to be a block-invariant pointer.
// It is user's responsibility to make sure that is true.
amd_intrinsic_buffer_store<T, DataPerAccess>(
*reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
}).Else([&](auto) {
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
});
});
#else
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
}
// buffer_store requires:
// 1) p_src must be in vgpr space, p_dst must be global memory
// 2) p_dst must be a block-invariant pointer.
// It is the user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset,
T* p_dst,
index_t dst_offset) const
{
amd_buffer_store<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
}
#endif
}
};
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
template <typename T, index_t DataPerAccess>
struct AtomicAddData
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
static_if<SrcAddressSpace == AddressSpace::Vgpr &&
DstAddressSpace == AddressSpace::Global>{}([&](auto) {
#if CK_USE_AMD_BUFFER_ATOMIC_ADD
amd_intrinsic_buffer_atomic_add<T, DataPerAccess>(
*reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
#else
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
// This generic version exists only for compatibility; avoid it when possible
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
__device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
{
atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
}
#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
// buffer_atomic_add requires:
// 1) p_src must be in vgpr space, p_dst must be global memory
// 2) p_dst must be a block-invariant pointer.
// It is the user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset,
T* p_dst,
index_t dst_offset) const
{
amd_buffer_atomic_add<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
}
#endif
}).Else([&](auto fwd) {
static_assert(fwd(false), "atomic_add doesn't support this memory space");
});
}
};
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp>
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride = 1,
index_t DstDataStride = 1>
__device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
static_assert(DstInMemOp == InMemoryDataOperation::Set ||
DstInMemOp == InMemoryDataOperation::AtomicAdd,
"wrong! InMemoryDataOperation not supported!");
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
set_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
// keep it simple: don't use static_if here, otherwise the compiler does weird things
if(SrcDataStride == 1 && DstDataStride == 1)
{
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
}
else
{
for(index_t i = 0; i < DataPerAccess; i++)
{
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
});
}
}
}
} // namespace ck
......
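The new SrcDataStride/DstDataStride parameters default to 1, so existing call sites compile unchanged; a strided access simply decomposes into DataPerAccess scalar transfers. A minimal call sketch (pointer and offset names hypothetical):

// read 4 floats from registers at stride 2 and atomically accumulate them
// into 4 contiguous global values
transfer_data<float,
              4,
              AddressSpace::Vgpr,
              AddressSpace::Global,
              InMemoryDataOperation::AtomicAdd,
              2,
              1>(p_thread_buf, 0, p_global, dst_offset);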
......@@ -3,56 +3,106 @@
namespace ck {
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
template <typename T>
__device__ void atomic_add_impl(T* p_dst, T src)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
atomicAdd(p_dst, src);
}
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
// atomicAdd has no overload for float vector types, so decompose into scalar adds
template <>
__device__ void atomic_add_impl<float2_t>(float2_t* p_dst, float2_t src)
{
float* p_dst_float = reinterpret_cast<float*>(p_dst);
const float* p_src_float = reinterpret_cast<const float*>(&src);
for(index_t i = 0; i < 2; ++i)
{
atomicAdd(&(p_dst_float[i]), p_src_float[i]);
}
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
template <>
__device__ void atomic_add_impl<float4_t>(float4_t* p_dst, float4_t src)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
float* p_dst_float = reinterpret_cast<float*>(p_dst);
const float* p_src_float = reinterpret_cast<const float*>(&src);
static_if<SrcAddressSpace == AddressSpace::Vgpr &&
DstAddressSpace == AddressSpace::Global>{}([&](auto) {
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
}).Else([&](auto fwd) {
static_assert(fwd(false), "atomic_add doesn't support this memory space");
});
for(index_t i = 0; i < 4; ++i)
{
atomicAdd(&(p_dst_float[i]), p_src_float[i]);
}
}
template <typename T, index_t DataPerAccess>
struct SetData
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
__device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
}
};
template <typename T, index_t DataPerAccess>
struct AtomicAddData
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
__device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
{
atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
}
};
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp>
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride = 1,
index_t DstDataStride = 1>
__device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
static_assert(DstInMemOp == InMemoryDataOperation::Set ||
DstInMemOp == InMemoryDataOperation::AtomicAdd,
"wrong! InMemoryDataOperation not supported!");
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
copy_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
// keep it simple: don't use static_if here, otherwise the compiler does weird things
if(SrcDataStride == 1 && DstDataStride == 1)
{
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
}
else
{
for(index_t i = 0; i < DataPerAccess; i++)
{
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
});
}
}
}
} // namespace ck
......
......@@ -3,6 +3,7 @@
#include "config.hpp"
#include "integral_constant.hpp"
#include "number.hpp"
#include "type.hpp"
namespace ck {
......
#ifndef CK_SYNCHRONIZATION_AMD_HPP
#define CK_SYNCHRONIZATION_AMD_HPP
#include "config.hpp"
namespace ck {
__device__ void __llvm_amdgcn_s_barrier() __asm("llvm.amdgcn.s.barrier");
__device__ void block_sync_lds()
{
#if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
asm volatile("\
s_waitcnt lgkmcnt(0) \n \
s_barrier \
" ::);
#else
__llvm_amdgcn_s_barrier();
#endif
}
__device__ void block_sync_lds_vmem() { __llvm_amdgcn_s_barrier(); }
} // namespace ck
#endif
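block_sync_lds guards LDS producer/consumer handoffs: with CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM enabled, only outstanding LDS traffic (lgkmcnt) must drain before the barrier, while global-memory loads may stay in flight across it. A minimal usage sketch (both helpers are hypothetical):

// main-loop handoff through LDS in a blockwise GEMM
blockwise_copy_global_to_lds(); // producer: writes LDS
block_sync_lds();               // s_waitcnt lgkmcnt(0) + s_barrier
blockwise_gemm_read_lds();      // consumer: reads LDS
block_sync_lds();               // LDS may now be overwritten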
#ifndef CK_SYNCHRONIZATION_NVIDIA_HPP
#define CK_SYNCHRONIZATION_NVIDIA_HPP
#include "config.hpp"
namespace ck {
__device__ void block_sync_lds() { __syncthreads(); }
__device__ void block_sync_lds_vmem() { __syncthreads(); }
} // namespace ck
#endif
set(TENSOR_SOURCE
src/tensor.cpp;
src/host_tensor.cpp;
src/device.cpp;
)
......@@ -25,8 +25,6 @@ elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
endif()
add_executable(conv_driver ${CONV_SOURCE})
add_executable(col2im_driver ${COL2IM_SOURCE})
add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})
target_link_libraries(conv_driver PRIVATE host)
target_link_libraries(col2im_driver PRIVATE host)
target_link_libraries(conv_bwd_data_driver PRIVATE host)
#ifndef CONV_COMMON_HPP
#define CONV_COMMON_HPP
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "tensor_descriptor.hpp"
template <class InDesc,
class WeiDesc,
class ConvStrides,
class ConvDilations,
class LowerPads,
class UpperPads>
constexpr auto get_convolution_output_default_4d_tensor_descriptor_deprecated(
InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads)
{
using namespace ck;
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
static_assert(in_desc.GetNumOfDimension() == 4, "input nDim is not 4");
static_assert(wei_desc.GetNumOfDimension() == 4, "weight nDim is not 4");
static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
"input & weight dimension not consistent");
constexpr index_t N = in_desc.GetLength(I0);
constexpr index_t Hi = in_desc.GetLength(I2);
constexpr index_t Wi = in_desc.GetLength(I3);
constexpr index_t K = wei_desc.GetLength(I0);
constexpr index_t Y = wei_desc.GetLength(I2);
constexpr index_t X = wei_desc.GetLength(I3);
constexpr index_t HPadLow = LowerPads{}.Get(I0);
constexpr index_t WPadLow = LowerPads{}.Get(I1);
constexpr index_t HPadUp = UpperPads{}.Get(I0);
constexpr index_t WPadUp = UpperPads{}.Get(I1);
constexpr index_t YEff = (Y - 1) * ConvDilations{}[0] + 1;
constexpr index_t XEff = (X - 1) * ConvDilations{}[1] + 1;
constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1;
constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1;
return make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
}
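// Worked example (hypothetical sizes): Hi = Wi = 34, Y = X = 3,
// unit strides and dilations, zero padding:
//   YEff = (3 - 1) * 1 + 1 = 3
//   Ho   = (34 + 0 + 0 - 3) / 1 + 1 = 32
// so the resulting output descriptor is Sequence<N, K, 32, 32>.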
template <class InDesc,
class WeiDesc,
class ConvStrides,
......
......@@ -60,7 +60,7 @@ float launch_and_time_kernel(F kernel,
timer.End();
hipGetErrorString(hipGetLastError());
hipGetLastError();
return timer.GetElapsedTime();
}
......@@ -101,8 +101,6 @@ float launch_and_time_kernel(F kernel,
timer.End();
checkCudaErrors(error);
return timer.GetElapsedTime();
}
#endif
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_col2im_eb_nchw.hpp"
template <typename T,
typename ColDesc,
typename ImgDesc,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_col2im_eb_nchw(ColDesc,
const Tensor<T>& col_eb,
ImgDesc,
Tensor<T>& img_nchw,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr auto col_eb_desc = ColDesc{};
constexpr auto img_nchw_desc = ImgDesc{};
constexpr index_t N = img_nchw_desc.GetLengths()[0];
constexpr index_t C = img_nchw_desc.GetLengths()[1];
constexpr index_t Hi = img_nchw_desc.GetLengths()[2];
constexpr index_t Wi = img_nchw_desc.GetLengths()[3];
constexpr index_t E = col_eb_desc.GetLengths()[0];
constexpr index_t B = col_eb_desc.GetLengths()[1];
std::size_t data_sz = sizeof(T);
DeviceMem col_eb_device_buf(data_sz * col_eb.mDesc.GetElementSpace());
DeviceMem img_nchw_device_buf(data_sz * img_nchw.mDesc.GetElementSpace());
col_eb_device_buf.ToDevice(col_eb.mData.data());
img_nchw_device_buf.ToDevice(img_nchw.mData.data());
#if 1
constexpr index_t BlockSize = 256;
constexpr index_t EPerBlock = 128;
constexpr index_t BPerBlock = 128;
using BlockCopySubLengths_E_B = Sequence<8, 8>;
using BlockCopyClusterLengths_E_B = Sequence<16, 16>;
using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
using BlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
using BlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
constexpr index_t BlockCopyDataPerAccess_B = 1;
#endif
constexpr index_t GridSize =
((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_col2im = GridwiseCol2Im_eb_nchw<GridSize,
BlockSize,
T,
ColDesc,
ImgDesc,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
EPerBlock,
BPerBlock,
BlockCopySubLengths_E_B,
BlockCopyClusterLengths_E_B,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
BlockCopyDataPerAccess_B>{};
for(index_t i = 0; i < nrepeat; ++i)
{
float time =
launch_and_time_kernel(run_gridwise_operation<decltype(gridwise_col2im),
const T* const __restrict__,
T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
gridwise_col2im,
const_cast<const T* const __restrict__>(
static_cast<T*>(col_eb_device_buf.GetDeviceBuffer())),
const_cast<T* const __restrict__>(
static_cast<T*>(img_nchw_device_buf.GetDeviceBuffer())));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
img_nchw_device_buf.FromDevice(img_nchw.mData.data());
}
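As a worked example of the grid-size arithmetic above (sizes hypothetical): with EPerBlock = BPerBlock = 128, a column tensor with E = 1152 and B = 16384 gives

// GridSize = ceil(E / EPerBlock) * ceil(B / BPerBlock)
//          = ((1152 + 127) / 128) * ((16384 + 127) / 128) = 9 * 128 = 1152 blocks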
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp"
......@@ -49,16 +49,16 @@ void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc i
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 0
#if 1
// BlockSize = 256, each thread holds 64 data
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
......@@ -83,6 +83,36 @@ void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc i
// BlockSize = 256, each thread holds 64 data
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmThreadGemmDataPerReadM = 4;
constexpr index_t GemmThreadGemmDataPerReadN = 4;
using GemmABlockCopyThreadSliceLengths_GemmK_GemmM = Sequence<1, 4>;
using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<8, 32>;
constexpr index_t GemmABlockCopySrcDataPerRead_GemmM = 4;
constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 4;
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<1, 4>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<8, 32>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 4;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 4;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 4;
#elif 1
// BlockSize = 256, each thread holds 64 data
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 16;
......@@ -119,7 +149,7 @@ void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc i
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv = GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw<
using gridwise_conv_bwd_data = GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw<
GridSize,
BlockSize,
T,
......@@ -151,28 +181,38 @@ void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc i
GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
GemmBBlockCopySrcDataPerRead_GemmN,
GemmBBlockCopyDstDataPerWrite_GemmN,
GemmCThreadCopyDstDataPerWrite_GemmN1>{};
GemmCThreadCopyDstDataPerWrite_GemmN1>;
for(index_t i = 0; i < nrepeat; ++i)
for(index_t i = 0; i < 5; ++i)
{
float time = launch_and_time_kernel(run_gridwise_operation<decltype(gridwise_conv),
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
gridwise_conv,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<gridwise_conv_bwd_data,
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
......
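The timing loops were reworked: instead of timing and sleeping after every launch, nrepeat launches now run back to back inside one timer and the average is reported. KernelTimer's internals are not shown in this diff; a minimal sketch of the same pattern using plain HIP events (assuming a HIP build):

hipEvent_t start, stop;
hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start, nullptr);
for(int j = 0; j < nrepeat; ++j)
{
    // launch the kernel here, as in the loops above
}
hipEventRecord(stop, nullptr);
hipEventSynchronize(stop); // wait for all launches to finish
float total_ms = 0;
hipEventElapsedTime(&total_ms, start, stop);
float ave_ms = total_ms / nrepeat;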
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp"
......@@ -55,25 +55,27 @@ void device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw(InDesc i
constexpr index_t BPerBlock = 32;
constexpr index_t EPerBlock = 32;
constexpr index_t KPerBlock = 8;
constexpr index_t KPerBlock = 16;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using OutBlockCopySubLengths_K_B_N0 = Sequence<1, 1, 4>;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using OutBlockCopySubLengths_K_B_N0 = Sequence<2, 1, 4>;
using OutBlockCopyClusterLengths_K_B_N0 = Sequence<8, 32, 1>;
constexpr index_t OutBlockCopySrcDataPerRead_B = 1;
constexpr index_t OutBlockCopyDstDataPerWrite_N0 = 4;
using WeiBlockCopySubLengths_K_E_C0 = Sequence<1, 4, 1>;
using WeiBlockCopySubLengths_K_E_C0 = Sequence<2, 4, 1>;
using WeiBlockCopyClusterLengths_K_E_C0 = Sequence<8, 8, 4>;
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
......@@ -82,8 +84,8 @@ void device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw(InDesc i
constexpr index_t InThreadCopyDstDataPerWrite_B = 1;
#endif
constexpr index_t C0 = GemmMPerThreadSubC;
constexpr index_t N0 = GemmNPerThreadSubC;
constexpr index_t C0 = GemmMPerThread;
constexpr index_t N0 = GemmNPerThread;
constexpr index_t C1 = C / C0;
constexpr index_t N1 = N / N0;
......@@ -96,7 +98,7 @@ void device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw(InDesc i
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv =
using gridwise_conv_bwd_data =
GridwiseConvolutionBackwardDataImplicitGemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer<
GridSize,
BlockSize,
......@@ -112,13 +114,13 @@ void device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw(InDesc i
EPerBlock,
BPerBlock,
KPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMPerThread,
GemmNPerThread,
GemmKPerThread,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
OutBlockCopySubLengths_K_B_N0,
......@@ -129,28 +131,38 @@ void device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw(InDesc i
WeiBlockCopyClusterLengths_K_E_C0,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_C0,
InThreadCopyDstDataPerWrite_B>{};
InThreadCopyDstDataPerWrite_B>;
for(index_t i = 0; i < nrepeat; ++i)
for(index_t i = 0; i < 5; ++i)
{
float time = launch_and_time_kernel(run_gridwise_operation<decltype(gridwise_conv),
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
gridwise_conv,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<gridwise_conv_bwd_data,
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp"
......@@ -185,7 +185,7 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv = GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw<
using gridwise_conv_bwd_data = GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw<
GridSize,
BlockSize,
T,
......@@ -217,28 +217,38 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i
GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
GemmBBlockCopySrcDataPerRead_GemmN,
GemmBBlockCopyDstDataPerWrite_GemmN,
GemmCThreadCopyDstDataPerWrite_GemmN1>{};
GemmCThreadCopyDstDataPerWrite_GemmN1>;
for(index_t i = 0; i < nrepeat; ++i)
for(index_t i = 0; i < 5; ++i)
{
float time = launch_and_time_kernel(run_gridwise_operation<decltype(gridwise_conv),
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
gridwise_conv,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<gridwise_conv_bwd_data,
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw.hpp"
......@@ -124,7 +124,7 @@ void device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw(InDesc i
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv = GridwiseConvolutionBackwardDataImplicitGemm_v3r1_nchw_kcyx_nkhw<
using gridwise_conv_bwd_data = GridwiseConvolutionBackwardDataImplicitGemm_v3r1_nchw_kcyx_nkhw<
GridSize,
BlockSize,
T,
......@@ -156,28 +156,38 @@ void device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw(InDesc i
GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
GemmBBlockCopySrcDataPerRead_GemmN,
GemmBBlockCopyDstDataPerWrite_GemmN,
GemmCThreadCopyDstDataPerWrite_GemmN1>{};
GemmCThreadCopyDstDataPerWrite_GemmN1>;
for(index_t i = 0; i < nrepeat; ++i)
for(index_t i = 0; i < 5; ++i)
{
float time = launch_and_time_kernel(run_gridwise_operation<decltype(gridwise_conv),
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
gridwise_conv,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<gridwise_conv_bwd_data,
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
namespace launcher {
using namespace ck;
template <typename GridwiseOp, index_t GemmId, typename... Xs>
__global__ void run_gridwise_convolution_backward_data_v4r1(Xs... xs)
{
GridwiseOp::template Run<GemmId>(xs...);
}
template <typename T,
typename InDesc,
typename WeiDesc,
......@@ -91,36 +86,6 @@ void device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc i
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 1
// BlockSize = 256, each thread holds 64 data
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 16;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmThreadGemmDataPerReadM = 4;
constexpr index_t GemmThreadGemmDataPerReadN = 4;
using GemmABlockCopyThreadSliceLengths_GemmK_GemmM = Sequence<8, 1>;
using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
constexpr index_t GemmABlockCopySrcDataPerRead_GemmM = 1;
constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 1;
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<8, 1>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<2, 128>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#endif
......@@ -157,78 +122,82 @@ void device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc i
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
for(index_t i = 0; i < 5; ++i)
{
using GridwiseConvBwdData = GridwiseConvolutionBackwardDataImplicitGemm_v4r1_nchw_kcyx_nkhw<
GridSize,
BlockSize,
T,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerThread,
GemmNPerThread,
GemmKPerThread,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmThreadGemmDataPerReadM,
GemmThreadGemmDataPerReadN,
GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
GemmABlockCopySrcDataPerRead_GemmM,
GemmABlockCopyDstDataPerWrite_GemmM,
GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
GemmBBlockCopySrcDataPerRead_GemmN,
GemmBBlockCopyDstDataPerWrite_GemmN,
GemmCThreadCopyDstDataPerWrite_GemmN1>;
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
static_for<0, GridwiseConvBwdData::GetNumberOfGemm(), 1>{}([&](auto gemm_id_) {
constexpr index_t gemm_id = decltype(gemm_id_){};
constexpr auto gemm_sizes = GridwiseConvBwdData::GetGemmSize(gemm_id);
constexpr index_t gemm_k = gemm_sizes.At(2);
constexpr bool is_gemm_not_empty = gemm_k > 0;
// only compile and run if GEMM is not empty
static_if<is_gemm_not_empty>{}([&](auto fwd) {
launch_kernel(
run_gridwise_convolution_backward_data_v4r1<GridwiseConvBwdData,
fwd(gemm_id),
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
for(index_t i = 0; i < nrepeat; ++i)
{
using GridwiseConvBwdData =
GridwiseConvolutionBackwardDataImplicitGemm_v4r1_nchw_kcyx_nkhw<
GridSize,
BlockSize,
T,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerThread,
GemmNPerThread,
GemmKPerThread,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmThreadGemmDataPerReadM,
GemmThreadGemmDataPerReadN,
GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
GemmABlockCopySrcDataPerRead_GemmM,
GemmABlockCopyDstDataPerWrite_GemmM,
GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
GemmBBlockCopySrcDataPerRead_GemmN,
GemmBBlockCopyDstDataPerWrite_GemmN,
GemmCThreadCopyDstDataPerWrite_GemmN1>;
static_for<0, GridwiseConvBwdData::GetNumberOfGemm(), 1>{}([&](auto gemm_id) {
constexpr auto gemm_sizes = GridwiseConvBwdData::GetGemmSize(gemm_id);
constexpr index_t gemm_k = gemm_sizes.At(2);
constexpr bool is_gemm_not_empty = gemm_k > 0;
// only compile and run if GEMM is not empty
static_if<is_gemm_not_empty>{}([&](auto fwd) {
launch_kernel(run_gridwise_operation<GridwiseConvBwdData,
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__,
decltype(gemm_id)>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()),
fwd(gemm_id));
});
});
});
}
timer.End();
float time = timer.GetElapsedTime();
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
......
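In the v4r1 path above, the static_for/static_if pair instantiates one kernel per sub-GEMM and discards empty ones at compile time; the loop index is a Number<>, so each iteration sees it as a distinct compile-time constant. A minimal sketch of the idiom (bounds and predicate hypothetical):

static_for<0, 4, 1>{}([&](auto gemm_id) {
    constexpr index_t id = decltype(gemm_id){}; // compile-time value, same idiom as above
    static_if<(id % 2 == 0)>{}([&](auto) {
        // only even ids are instantiated; the rest compile to nothing
    });
});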
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in,
WeiDesc,
const Tensor<T>& wei,
OutDesc,
Tensor<T>& out,
index_t nrepeat)
{
std::size_t data_sz = sizeof(T);
DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
int num_thread = std::thread::hardware_concurrency();
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
out_device_buf.ToDevice(out.mData.data());
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
#if 1
// 3x3, 34x34, 128 thread
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 32;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 32;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 4;
constexpr index_t CPerThread = 2;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopyDataPerRead = 1;
constexpr index_t WeiBlockCopyDataPerRead = 1;
constexpr index_t BlockSize = 128;
#endif
constexpr index_t GridSize =
(out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
(out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
using gridwise_conv = GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw<GridSize,
BlockSize,
T,
InDesc,
WeiDesc,
OutDesc,
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead>;
float time = launch_and_time_kernel(run_gridwise_convolution_kernel<gridwise_conv, T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
out_device_buf.FromDevice(out.mData.data());
}
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
};
make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
std::thread::hardware_concurrency());
// reorder input
auto in_chwn_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Hi, Wi, N>{});
ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
auto f_reorder_nchw2chwn = [&](auto n, auto c, auto hi, auto wi) {
in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi);
};
make_ParallelTensorFunctor(f_reorder_nchw2chwn, N, C, Hi, Wi)(
std::thread::hardware_concurrency());
// output
auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
std::size_t data_sz = sizeof(T);
DeviceMem in_chwn_device_buf(data_sz * in_chwn.mDesc.GetElementSpace());
DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace());
in_chwn_device_buf.ToDevice(in_chwn.mData.data());
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_khwn_device_buf.ToDevice(out_khwn.mData.data());
#if 0
// for 3x3, 34x34, v1r1, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopyClusterLengths_CHWN = Sequence<4, 4, 2, 4>;
constexpr index_t InBlockCopyDataPerAccess_N = 4;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
#elif 1
// for 3x3, 34x34, v1r3, Pascal
// for 3x3, 28x28, v1r3, Pascal
// for 3x3, 14x14, v1r3, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopySubLengths_CHWN = Sequence<1, 1, 1, 4>;
using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 2, 4>;
constexpr index_t InBlockCopyDataPerAccess_N = 4;
using WeiBlockCopySubLengths_CK = Sequence<2, 4>;
using WeiBlockCopyClusterLengths_CK = Sequence<4, 32>;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
#elif 0
// for 3x3, 34x34, v1r1, Vega 20
constexpr index_t BlockSize = 256;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopyClusterLengths_CHWN = Sequence<4, 4, 2, 8>;
constexpr index_t InBlockCopyDataPerAccess_N = 2;
constexpr index_t WeiBlockCopyDataPerAccess_K = 2;
constexpr index_t OutThreadCopyDataPerAccess_N = 4;
#elif 1
// for 3x3, 34x34, v1r3, Vega 20
constexpr index_t BlockSize = 256;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopySubLengths_CHWN = Sequence<1, 1, 1, 4>;
using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 4, 4>;
constexpr index_t InBlockCopyDataPerAccess_N = 4;
using WeiBlockCopySubLengths_CK = Sequence<1, 4>;
using WeiBlockCopyClusterLengths_CK = Sequence<8, 32>;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
constexpr index_t OutThreadCopyDataPerAccess_N = 4;
#elif 0
// for 3x3, 56x56, v1r1, Pascal
constexpr index_t NPerBlock = 32;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopy_ThreadPerDimC = 1;
constexpr index_t InBlockCopy_ThreadPerDimH = 4;
constexpr index_t InBlockCopy_ThreadPerDimW = 4;
constexpr index_t InBlockCopy_ThreadPerDimN = 8;
constexpr index_t InBlockCopyDataPerAccess_N = 4;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
constexpr index_t BlockSize = 128;
#elif 0
// for 3x3, 56x56, v1r2, Pascal
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 1;
constexpr index_t GemmDataPerReadB = 1;
constexpr index_t InBlockCopy_ThreadPerDimC = 1;
constexpr index_t InBlockCopy_ThreadPerDimH = 2;
constexpr index_t InBlockCopy_ThreadPerDimW = 4;
constexpr index_t InBlockCopy_ThreadPerDimN = 4;
constexpr index_t InBlockCopyDataPerAccess_N = 4;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
constexpr index_t OutThreadCopyDataPerAccess_N = 4;
constexpr index_t BlockSize = 128;
#elif 0
// for 3x3, 28x28, v1r1, Pascal
constexpr index_t NPerBlock = 32;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopy_ThreadPerDimC = 1;
constexpr index_t InBlockCopy_ThreadPerDimH = 4;
constexpr index_t InBlockCopy_ThreadPerDimW = 4;
constexpr index_t InBlockCopy_ThreadPerDimN = 8;
constexpr index_t InBlockCopyDataPerAccess_N = 4;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
constexpr index_t BlockSize = 128;
#elif 0
// for 3x3, 28x28, v1r2, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopyClusterLengths_CHWN = Sequence<4, 2, 4, 4>;
constexpr index_t InBlockCopyDataPerAccess_N = 4;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
#elif 0
// for 1x1, 28x28, v1r1, Pascal
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 16;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t InBlockCopy_ThreadPerDimC = 8;
constexpr index_t InBlockCopy_ThreadPerDimH = 2;
constexpr index_t InBlockCopy_ThreadPerDimW = 2;
constexpr index_t InBlockCopy_ThreadPerDimN = 4;
constexpr index_t InBlockCopyDataPerAccess_N = 4;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
constexpr index_t BlockSize = 128;
#elif 0
// for 1x1, 14x14, v1r1, Pascal
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 8;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t InBlockCopy_ThreadPerDimC = 8;
constexpr index_t InBlockCopy_ThreadPerDimH = 2;
constexpr index_t InBlockCopy_ThreadPerDimW = 2;
constexpr index_t InBlockCopy_ThreadPerDimN = 4;
constexpr index_t InBlockCopyDataPerAccess_N = 4;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
constexpr index_t BlockSize = 128;
#endif
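// The grid decomposition below has no boundary/tail handling, so the problem
// sizes must divide evenly by the per-block tile sizes. An illustrative
// compile-time check (assuming N, K, Ho, Wo are constexpr here, as the
// constexpr GridSize below already requires):
static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 &&
              Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
              "problem sizes must be divisible by the per-block tile sizes");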
constexpr index_t GridSize =
(N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);
printf("%s: BlockSize %d, GridSize %d\n",
       __func__,
       static_cast<int>(BlockSize),
       static_cast<int>(GridSize));
constexpr auto gridwise_conv =
#if 0
GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
#elif 0
GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
#elif 0
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
#elif 1
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
#endif
<GridSize,
BlockSize,
T,
decltype(in_chwn_desc),
decltype(wei_cyxk_desc),
decltype(out_khwn_desc),
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
HoPerThread,
WoPerThread,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_CHWN,
InBlockCopyClusterLengths_CHWN,
InBlockCopyDataPerAccess_N,
WeiBlockCopySubLengths_CK,
WeiBlockCopyClusterLengths_CK,
WeiBlockCopyDataPerAccess_K,
OutThreadCopyDataPerAccess_N>{};
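// Timing loop: each iteration launches the kernel once and reports elapsed
// time; GFlop / ms below is numerically equal to TFlop/s. The usleep throttles
// back-to-back launches (sleep equals the kernel time in microseconds, capped
// at 10 ms) so the device can settle between repeats.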
for(index_t i = 0; i < nrepeat; ++i)
{
float time =
launch_and_time_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
out_khwn_device_buf.FromDevice(out_khwn.mData.data());
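// Reorder the KHWN result back to NKHW so it can be compared against a host
// reference convolution in the original layout.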
// reorder output
auto f_reorder_khwn2nkhw = [&](auto k, auto ho, auto wo, auto n) {
out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n);
};
make_ParallelTensorFunctor(f_reorder_khwn2nkhw, K, Ho, Wo, N)(
std::thread::hardware_concurrency());
}