gaoqiong / composable_kernel

Commit 32850b93, authored Oct 09, 2019 by Wen-Heng (Jack) Chung
Parent: 583755a7

    Ported xdlops kernels to debug bwdwrw fp32/fp16/bfp16 issue. Verified at least fwd data fp32 works.

Showing 17 changed files with 1462 additions and 123 deletions (+1462 -123)
composable_kernel/include/utility/common_header.hpp  +2 -2
composable_kernel/include/utility/config.hpp.bkup  +70 -0
composable_kernel/include/utility/config_amd.hpp.in  +14 -0
composable_kernel/include/utility/config_nvidia.hpp.in  +34 -0
composable_kernel/include/utility/functional2.hpp  +5 -3
composable_kernel/include/utility/functional3.hpp  +84 -52
composable_kernel/include/utility/integral_constant.hpp  +35 -14
composable_kernel/include/utility/math.hpp  +73 -10
composable_kernel/include/utility/vector_type.hpp  +70 -29
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer.cpp  +139 -0
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.cpp  +256 -0
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer.cpp  +124 -0
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp  +217 -0
driver/include/conv_common.hpp  +1 -0
driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp  +2 -2
driver/include/device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw.hpp  +320 -0
driver/src/driver.cpp  +16 -11
composable_kernel/include/utility/common_header.hpp

 #ifndef CK_COMMON_HEADER_HPP
 #define CK_COMMON_HEADER_HPP

-#define MIOPEN_USE_FP16 1
+#define MIOPEN_USE_FP16 0
 #define MIOPEN_USE_BFP16 0
-#define MIOPEN_USE_FP32 0
+#define MIOPEN_USE_FP32 1

 #define __HIP_PLATFORM_HCC__ 1
 ...
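These MIOPEN_USE_* switches decide what the FLOAT and FLOAT_ACCUM aliases used by the kernel wrappers below resolve to (via float_types.h, which is not part of this commit). A minimal sketch of that kind of gating — illustrative only, not the actual contents of float_types.h:

// Illustrative sketch only; the real mapping lives in float_types.h.
#if MIOPEN_USE_FP32
typedef float FLOAT;        // tensor data type
typedef float FLOAT_ACCUM;  // accumulator type
#elif MIOPEN_USE_FP16
typedef _Float16 FLOAT;
typedef float FLOAT_ACCUM;
#elif MIOPEN_USE_BFP16
typedef ushort FLOAT;       // bfloat16 carried as ushort, as in config.hpp.bkup below
typedef float FLOAT_ACCUM;
#endif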
composable_kernel/include/utility/config.hpp.bkup
0 → 100644

#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP

#if 0
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#endif
#include "bfloat16_dev.hpp"

#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INLINE_ASM 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

#ifndef CK_USE_INLINE_ASM_XDLOPS
#define CK_USE_INLINE_ASM_XDLOPS 0
#endif

namespace ck {

// float
// For some reason, HIP compiler need this definition to generate optimal load and store
// instruction
typedef float float32_t __attribute__((ext_vector_type(32)));
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));

// half
typedef half2 half2_t;

// index_t: used for index calculation
using index_t = uint32_t;

// data type conversion
template <class T>
struct type_convert
{
    template <class X>
    __device__ T operator()(X x) const
    {
        return static_cast<T>(x);
    }
};

template <>
template <>
__device__ float type_convert<float>::operator()<ushort>(ushort x) const
{
    return bfloat16_to_float(x);
}

template <>
template <>
__device__ ushort type_convert<ushort>::operator()<float>(float x) const
{
    return float_to_bfloat16(x);
}

} // namespace ck
#endif
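For reference, a host-side sketch of what the bfloat16_to_float / float_to_bfloat16 helpers from bfloat16_dev.hpp compute, assuming plain truncation (the device versions may round differently):

#include <cstdint>
#include <cstring>

// bfloat16 is the upper 16 bits of an IEEE-754 float.
float bfloat16_to_float_host(uint16_t x)
{
    uint32_t bits = static_cast<uint32_t>(x) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

uint16_t float_to_bfloat16_host(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16); // truncate, no rounding
}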
composable_kernel/include/utility/config_amd.hpp.in

@@ -4,6 +4,9 @@
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
+#include "bfloat16_dev.hpp"

 #define CK_DEVICE_BACKEND_AMD 1
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 ...
@@ -11,11 +14,22 @@
 namespace ck {

+// float
 // For some reason, HIP compiler need this definition to generate optimal load and store
 // instruction
+typedef float float32_t __attribute__((ext_vector_type(32)));
 typedef float float2_t __attribute__((ext_vector_type(2)));
 typedef float float4_t __attribute__((ext_vector_type(4)));
+typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
+typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
+typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
+
+// half
+typedef half2 half2_t;
+
+// index_t: used for index calculation
 using index_t = uint32_t;

 __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
 ...
composable_kernel/include/utility/config_nvidia.hpp.in

@@ -6,8 +6,11 @@
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
+#include "bfloat16_dev.hpp"

 #define CK_DEVICE_BACKEND_NVIDIA 1
 #define CK_USE_AMD_INLINE_ASM 0
+#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
 ...
@@ -22,6 +25,12 @@ using float4_t = float4;
 using index_t = uint32_t;

+using half2_t = half2;
+
+typedef struct
+{
+    half2 value[2];
+} half4_t;
+
 __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
 {
     d += s0 * s1;
 ...
@@ -51,6 +60,31 @@ __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const i
 }
 #endif

+// data type conversion
+template <class T>
+struct type_convert
+{
+    template <class X>
+    __device__ T operator()(X x) const
+    {
+        return static_cast<T>(x);
+    }
+};
+
+template <>
+template <>
+__device__ float type_convert<float>::operator()<ushort>(ushort x) const
+{
+    return bfloat16_to_float(x);
+}
+
+template <>
+template <>
+__device__ ushort type_convert<ushort>::operator()<float>(float x) const
+{
+    return float_to_bfloat16(x);
+}
+
 } // namespace ck
 #endif
composable_kernel/include/utility/functional2.hpp

@@ -23,14 +23,16 @@ struct static_for_impl<Sequence<Is...>>
 template <index_t NBegin, index_t NEnd, index_t Increment>
 struct static_for
 {
+    __host__ __device__ constexpr static_for()
+    {
+        static_assert(NBegin <= NEnd, "wrongs! should have NBegin <= NEnd");
+        static_assert((NEnd - NBegin) % Increment == 0,
+                      "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
+    }
+
     template <class F>
     __host__ __device__ constexpr void operator()(F f) const
     {
-        static_assert(NBegin <= NEnd, "wrongs! should have NBegin <= NEnd");
-        static_assert((NEnd - NBegin) % Increment == 0,
-                      "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
         static_for_impl<typename arithmetic_sequence_gen<NBegin, NEnd, Increment>::type>{}(f);
     }
 };
 ...
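For context, a usage sketch of static_for as the files below use it (I arrives as a compile-time Number<>):

#include "functional2.hpp"

// Unrolled compile-time loop over I = 0, 2, 4; each I is a distinct Number<> type,
// so it can be used in constant expressions inside the lambda.
ck::static_for<0, 6, 2>{}([&](auto I) {
    // ... body instantiated once per I ...
});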
composable_kernel/include/utility/functional3.hpp

@@ -8,106 +8,138 @@

static_ford and ford (and their _impl helpers) gain an Orders template parameter that selects the order in which the dimensions are looped over, an is_static trait is added, and ford_impl is re-keyed from a RemainDim count to a (RemainLengths, Orders) pair. The region now reads:

namespace ck {

template <class>
struct is_static : integral_constant<bool, false>
{
};

template <class T, T X>
struct is_static<integral_constant<T, X>> : integral_constant<bool, true>
{
};

template <index_t... Is>
struct is_static<Sequence<Is...>> : integral_constant<bool, true>
{
};

// RemainLengths: Sequence<...>
// Orders: Sequence<...>
template <class RemainLengths, class Orders>
struct static_ford_impl
{
    __host__ __device__ constexpr static_ford_impl()
    {
        static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here");
    }

    // F signature: F(Sequence<...>)
    // CurrentOrderedId: Sequence<...>
    template <class F, class CurrentOrderedId>
    __host__ __device__ constexpr void operator()(F f, CurrentOrderedId) const
    {
        static_for<0, RemainLengths::Front(), 1>{}([=](auto I) {
            static_ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
                f, CurrentOrderedId::PushBack(I));
        });
    }
};

template <class Orders>
struct static_ford_impl<Sequence<>, Orders>
{
    // F signature: F(Sequence<...>)
    // OrderedId: Sequence<...>
    template <class F, class OrderedId>
    __host__ __device__ constexpr void operator()(F f, OrderedId) const
    {
        // retrive unordered Id
        f(OrderedId::ReorderGivenOld2New(Orders{}));
    }
};

// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which static_ford will loop over each
// dimension
template <class Lengths,
          class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
struct static_ford
{
    __host__ __device__ constexpr static_ford()
    {
        static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
        static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size");
    }

    // F signature: F(Sequence<...> multi_id)
    // multi_id is the unordered multi-index
    template <class F>
    __host__ __device__ constexpr void operator()(F f) const
    {
        constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{});

        static_ford_impl<decltype(ordered_lengths), Orders>{}(f, Sequence<>{});
    }
};

// RemainLengths: Sequence<...>
// Orders: Sequence<...>
template <class RemainLengths, class Orders>
struct ford_impl
{
    // F signature: F(Array<...> multi_id)
    // CurrentOrderdId: Array<...>
    template <class F, class CurrentOrderedId>
    __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
    {
        for(index_t i = 0; i < RemainLengths::Front(); ++i)
        {
            ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
                f, current_ordered_id.PushBack(i));
        }
    }
};

template <class Orders>
struct ford_impl<Sequence<>, Orders>
{
    // F signature: F(Array<...> multi_id)
    // CurrentOrderdId: Array<...>
    template <class F, class CurrentOrderedId>
    __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
    {
        // retrive unordered Id
        f(reorder_array_given_old2new(current_ordered_id, Orders{}));
    }
};

// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which ford will loop over each
// dimension
template <class Lengths,
          class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
struct ford
{
    __host__ __device__ constexpr ford()
    {
        static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
        static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size");
    }

    // F signature: F(Array<...> multi_id)
    // multi_id is the unordered multi-index
    template <class F>
    __host__ __device__ constexpr void operator()(F f) const
    {
        constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{});

        for(index_t i = 0; i < ordered_lengths.Front(); ++i)
        {
            ford_impl<decltype(ordered_lengths.PopFront()), Orders>{}(f, Array<index_t, 1>{i});
        }
    }
};
...
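A usage sketch of the new Orders parameter (hypothetical lengths and orders, not from the diff): the loops run in the requested dimension order, but the functor still receives the unordered multi-index.

#include "functional3.hpp"

using Lengths = ck::Sequence<2, 3>;
using Orders  = ck::Sequence<1, 0>; // iterate dimension 1 as the outer loop

ck::ford<Lengths, Orders>{}([&](auto idx) {
    // idx is an Array<index_t, 2>: idx[0] in [0, 2), idx[1] in [0, 3),
    // visited with idx[1] varying slowest
});

ck::static_ford<Lengths, Orders>{}([&](auto idx) {
    // idx is a Sequence<...> known at compile time, same ordering behaviour
});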
composable_kernel/include/utility/integral_constant.hpp

@@ -13,30 +13,51 @@ struct integral_constant

is_same moves ahead of the Number alias, and the operator+ / operator* overloads on generic integral_constant are replaced by +, -, *, /, % overloads on Number<>, with static_asserts guarding underflow and division by zero. The region now reads:

    __host__ __device__ constexpr value_type operator()() const noexcept { return value; }
};

template <class X, class Y>
struct is_same : public integral_constant<bool, false>
{
};

template <class X>
struct is_same<X, X> : public integral_constant<bool, true>
{
};

template <index_t N>
using Number = integral_constant<index_t, N>;

template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator+(Number<X>, Number<Y>)
{
    return Number<X + Y>{};
}

template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator-(Number<X>, Number<Y>)
{
    static_assert(Y <= X, "wrong!");
    return Number<X - Y>{};
}

template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator*(Number<X>, Number<Y>)
{
    return Number<X * Y>{};
}

template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator/(Number<X>, Number<Y>)
{
    static_assert(Y > 0, "wrong!");
    return Number<X / Y>{};
}

template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator%(Number<X>, Number<Y>)
{
    static_assert(Y > 0, "wrong!");
    return Number<X % Y>{};
}

} // namespace ck
#endif
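A quick illustration (not in the diff) of the new Number<> arithmetic:

#include "integral_constant.hpp"

constexpr auto a = ck::Number<6>{} + ck::Number<2>{}; // Number<8>
constexpr auto b = ck::Number<6>{} / ck::Number<2>{}; // Number<3>; operator/ static_asserts Y > 0
static_assert(a() == 8 && b() == 3, "values are available at compile time");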
composable_kernel/include/utility/math.hpp

@@ -3,6 +3,7 @@
 #include "config.hpp"
 #include "integral_constant.hpp"
+#include "vector_type.hpp"

 namespace ck {
 namespace math {
 ...
@@ -42,20 +43,16 @@ struct integer_divide_ceiler
 }
 };

-template <class T>
-__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
-{
-    static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
-    return (a + b - 1) / b;
-}
+template <class X, class Y>
+__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
+{
+    return (x + y - 1) / y;
+}

-template <class T>
-__host__ __device__ constexpr T integer_least_multiple(T a, T b)
-{
-    static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
-    return b * integer_divide_ceil(a, b);
-}
+template <class X, class Y>
+__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
+{
+    return y * integer_divide_ceil(x, y);
+}

 template <class T>
 ...
@@ -102,6 +99,72 @@ __host__ __device__ constexpr T lcm(T x, Ts... xs)
 return max(x, xs...);
 }

The same hunk appends inner_product_with_conversion:

template <class T>
struct inner_product_with_conversion
{
    static constexpr auto convert = type_convert<T>();

    __device__ T operator()(float a, float b) const { return convert(a) * convert(b); }

    __device__ T operator()(const vector_type<half, 2>::MemoryType& a,
                            const vector_type<half, 2>::MemoryType& b) const
    {
        const half* p_a_half = reinterpret_cast<const half*>(&a);
        const half* p_b_half = reinterpret_cast<const half*>(&b);

        T acc = 0;
        for(index_t v = 0; v < 2; ++v)
        {
            acc += convert(p_a_half[v]) * convert(p_b_half[v]);
        }
        return acc;
    }

    __device__ T operator()(const vector_type<half, 4>::MemoryType& a,
                            const vector_type<half, 4>::MemoryType& b) const
    {
        const half* p_a_half = reinterpret_cast<const half*>(&a);
        const half* p_b_half = reinterpret_cast<const half*>(&b);

        T acc = 0;
        for(index_t v = 0; v < 4; ++v)
        {
            acc += convert(p_a_half[v]) * convert(p_b_half[v]);
        }
        return acc;
    }

    __device__ T operator()(const vector_type<ushort, 2>::MemoryType& a,
                            const vector_type<ushort, 2>::MemoryType& b) const
    {
        const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
        const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);

        T acc = 0;
        for(index_t v = 0; v < 2; ++v)
        {
            acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
        }
        return acc;
    }

    __device__ T operator()(const vector_type<ushort, 4>::MemoryType& a,
                            const vector_type<ushort, 4>::MemoryType& b) const
    {
        const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
        const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);

        T acc = 0;
        for(index_t v = 0; v < 4; ++v)
        {
            acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
        }
        return acc;
    }
};

} // namespace math
} // namspace ck
...
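A quick illustration (not in the diff) of why the relaxed templates matter: mixed operand types now deduce cleanly, where the old single-T signature rejected them.

#include "math.hpp"

constexpr ck::index_t n       = 70; // uint32_t
constexpr int         per_blk = 16; // mixing these failed the old single-T deduction

constexpr auto n_blocks = ck::math::integer_divide_ceil(n, per_blk);    // (70 + 16 - 1) / 16 = 5
constexpr auto padded   = ck::math::integer_least_multiple(n, per_blk); // 16 * 5 = 80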
composable_kernel/include/utility/vector_type.hpp

The specializations drop their GetSize() helpers and gain Pack() helpers built on a DataType union; the generic fallback now exposes a MemoryType; the half/ushort specializations switch to the half2_t/half4_t/ushort2_t/ushort4_t types from config (the ushort2 union's scalar member is also corrected from half to ushort), and the ushort specializations end up wrapped in #if 0. The changed regions now read:

#ifndef CK_VECTOR_TYPE_HPP
#define CK_VECTOR_TYPE_HPP

#if 0
#include "hip/hip_fp16.h"
#else
#include "cuda_fp16.h"
#endif

#include "config.hpp"
#include "integral_constant.hpp"
...
@@ -10,7 +14,10 @@ namespace ck {
template <class T, index_t N>
struct vector_type
{
    typedef struct
    {
        T scalar[N];
    } MemoryType;
};

template <>
...
@@ -18,8 +25,6 @@ struct vector_type<float, 1>
{
    using MemoryType = float;

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
    {
...
@@ -33,9 +38,7 @@ struct vector_type<float, 2>
{
    using MemoryType = float2_t;

    union DataType
    {
        MemoryType vector;
        float scalar[2];
...
@@ -48,6 +51,13 @@ struct vector_type<float, 2>
        *(reinterpret_cast<float*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(float s0, float s1)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        return data.vector;
    }
};

template <>
...
@@ -70,8 +80,6 @@ struct vector_type<half, 1>
{
    using MemoryType = half;

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
    {
...
@@ -83,16 +91,14 @@ struct vector_type<half, 1>
template <>
struct vector_type<half, 2>
{
    using MemoryType = half2_t;

    union DataType
    {
        MemoryType vector;
        half scalar[2];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
    {
...
@@ -100,17 +106,25 @@ struct vector_type<half, 2>
        *(reinterpret_cast<half*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(half s0, half s1)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        return data.vector;
    }
};

template <>
struct vector_type<half, 4>
{
    using MemoryType = half4_t;

    union DataType
    {
        MemoryType vector;
        half scalar[4];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
...
@@ -118,15 +132,24 @@ struct vector_type<half, 4>
        static_assert(I < 4, "wrong");
        *(reinterpret_cast<half*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        data.scalar[2] = s2;
        data.scalar[3] = s3;
        return data.vector;
    }
};

#if 0
template <>
struct vector_type<ushort, 1>
{
    using MemoryType = ushort;

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
    {
...
@@ -138,16 +161,14 @@ struct vector_type<ushort, 1>
template <>
struct vector_type<ushort, 2>
{
    using MemoryType = ushort2_t;

    union DataType
    {
        MemoryType vector;
        ushort scalar[2];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
    {
...
@@ -155,17 +176,25 @@ struct vector_type<ushort, 2>
        *(reinterpret_cast<ushort*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(ushort s0, ushort s1)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        return data.vector;
    }
};

template <>
struct vector_type<ushort, 4>
{
    using MemoryType = ushort4_t;

    union DataType
    {
        MemoryType vector;
        ushort scalar[4];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
...
@@ -173,8 +202,20 @@ struct vector_type<ushort, 4>
        static_assert(I < 4, "wrong");
        *(reinterpret_cast<ushort*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        data.scalar[2] = s2;
        data.scalar[3] = s3;
        return data.vector;
    }
};
#endif

} // namespace ck
#endif
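A usage sketch (not in the diff) of the new Pack helpers:

#include "vector_type.hpp"

using vec2 = ck::vector_type<float, 2>;

vec2::MemoryType v = vec2::Pack(1.0f, 2.0f);   // float2_t holding {1, 2}
vec2::SetScalar(v, 3.0f, ck::Number<0>{});     // overwrite lane 0 -> {3, 2}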
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer.cpp
0 → 100644

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer.hpp"
#include "float_types.h"
#include "implicitgemm_params.hpp"

extern "C" __global__
    __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void
    gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer(
        const FLOAT* const __restrict__ p_in_global,
        const FLOAT* const __restrict__ p_wei_global,
        FLOAT* const __restrict__ p_out_global)
{
    using namespace ck;

    // read params: problem decription
    constexpr index_t N  = CK_PARAM_PROBLEM_N;
    constexpr index_t K  = CK_PARAM_PROBLEM_K;
    constexpr index_t C  = CK_PARAM_PROBLEM_C;
    constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
    constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
    constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
    constexpr index_t Wo = CK_PARAM_PROBLEM_WO;

    constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
    constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;

    // read params: tunable params
    constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
    constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
    constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
    constexpr index_t CPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;

    // read params: dependent params
    constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;

    constexpr auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
    constexpr auto wei_ck_desc   = make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<1, C>{});

    using ConvStrides = Sequence<ConvStrideH, ConvStrideW>;

    constexpr index_t GemmMPerThreadSubC = CK_PARAM_GEMM_M_PER_THREAD_SUB_C;
    constexpr index_t GemmNPerThreadSubC = CK_PARAM_GEMM_N_PER_THREAD_SUB_C;
    constexpr index_t GemmMLevel0Cluster = CK_PARAM_GEMM_M_LEVEL0_CLUSTER;
    constexpr index_t GemmNLevel0Cluster = CK_PARAM_GEMM_N_LEVEL0_CLUSTER;
    constexpr index_t GemmMLevel1Cluster = CK_PARAM_GEMM_M_LEVEL1_CLUSTER;
    constexpr index_t GemmNLevel1Cluster = CK_PARAM_GEMM_N_LEVEL1_CLUSTER;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA   = GemmMPerThreadSubC;
    constexpr index_t GemmDataPerReadB   = GemmNPerThreadSubC;

    constexpr index_t GemmNRepeat = 2;
    constexpr index_t N1          = GemmNRepeat;
    constexpr index_t N2          = GemmNPerThreadSubC;

    constexpr index_t InBlockCopyClusterLengths_E  = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t InBlockCopyClusterLengths_B  = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
    constexpr index_t InBlockCopyClusterLengths_N1 = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_N1;
    constexpr index_t InBlockCopyClusterLengths_N2 = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_N2;

    constexpr index_t InBlockCopySubLengths_E  = CPerBlock / InBlockCopyClusterLengths_E;
    constexpr index_t InBlockCopySubLengths_B  = BPerBlock / InBlockCopyClusterLengths_B;
    constexpr index_t InBlockCopySubLengths_N1 = N1 / InBlockCopyClusterLengths_N1;
    constexpr index_t InBlockCopySubLengths_N2 = N2 / InBlockCopyClusterLengths_N2;

    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<InBlockCopySubLengths_E,
                                                     InBlockCopySubLengths_N1,
                                                     InBlockCopySubLengths_B,
                                                     InBlockCopySubLengths_N2>;
    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<InBlockCopyClusterLengths_E,
                                                         InBlockCopyClusterLengths_N1,
                                                         InBlockCopyClusterLengths_B,
                                                         InBlockCopyClusterLengths_N2>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]

    constexpr index_t InBlockCopySrcDataPerRead_B   = CK_PARAM_IN_BLOCK_COPY_SRC_DATA_PER_READ_B;
    constexpr index_t InBlockCopyDstDataPerWrite_N2 = CK_PARAM_IN_BLOCK_COPY_DST_DATA_PER_WRITE_N2;

    constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
    constexpr index_t WeiBlockCopySubLengths_E     = CPerBlock / WeiBlockCopyClusterLengths_E;
    constexpr index_t WeiBlockCopySubLengths_K     = KPerBlock / WeiBlockCopyClusterLengths_K;

    using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = CK_PARAM_WEI_BLOCK_COPY_SRC_DATE_PER_READ_E;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATE_PER_WRITE_K;

    constexpr auto gridwise_conv =
        GridwiseConvolutionImplicitGemm_v4_nchw_kc1x1_nkhw_lds_double_buffer<
            GridSize, BlockSize, FLOAT, FLOAT_ACCUM,
            decltype(in_nchw_desc), decltype(wei_ck_desc), decltype(out_nkhw_desc),
            ConvStrides,
            static_cast<ImplicitGemmDirection>(CK_PARAM_PROBLEM_DIRECTION),
            BPerBlock, KPerBlock, CPerBlock, N1, N2,
            GemmMPerThreadSubC, GemmNPerThreadSubC,
            GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
            GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
            InBlockCopySubLengths_E_N1_B_N2, InBlockCopyClusterLengths_E_N1_B_N2,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
            InBlockCopySrcDataPerRead_B, InBlockCopyDstDataPerWrite_N2,
            WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E,
            WeiBlockCopyDstDataPerWrite_K>{};

    gridwise_conv.Run(p_in_global, p_wei_global, p_out_global);
}
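The wrapper only wires CK_PARAM_* macros into the gridwise template; the sub-length / cluster-length split it computes follows a simple rule: per-thread SubLengths times per-dimension ClusterLengths covers the block tile, and the cluster product normally equals the block size. A worked example with made-up tuning values (see the sketch below; none of these numbers come from the commit):

#include "common_header.hpp"

// Hypothetical tuning values, for illustration only.
constexpr ck::index_t CPerBlock = 8, BPerBlock = 16, N1 = 2, N2 = 4;
constexpr ck::index_t ClusterE = 8, ClusterB = 16, ClusterN1 = 1, ClusterN2 = 2;

constexpr ck::index_t SubE  = CPerBlock / ClusterE; // 1 element per thread along E
constexpr ck::index_t SubB  = BPerBlock / ClusterB; // 1
constexpr ck::index_t SubN1 = N1 / ClusterN1;       // 2
constexpr ck::index_t SubN2 = N2 / ClusterN2;       // 2

static_assert(ClusterE * ClusterB * ClusterN1 * ClusterN2 == 256,
              "cluster spans a 256-thread block (CK_PARAM_TUNABLE_BLOCK_SIZE)");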
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.cpp
0 → 100644

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "float_types.h"

extern "C" __global__
    __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void
    gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer(
        const FLOAT* const __restrict__ p_in_global,
        const FLOAT* const __restrict__ p_wei_global,
        FLOAT* const __restrict__ p_out_global)
{
    using namespace ck;

    // read params: problem decription
    constexpr index_t N  = CK_PARAM_PROBLEM_N;
    constexpr index_t K  = CK_PARAM_PROBLEM_K;
    constexpr index_t C  = CK_PARAM_PROBLEM_C;
    constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
    constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
    constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
    constexpr index_t Wo = CK_PARAM_PROBLEM_WO;
    constexpr index_t Y  = CK_PARAM_PROBLEM_Y;
    constexpr index_t X  = CK_PARAM_PROBLEM_X;

    constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
    constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;

    constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H;
    constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W;

    // read params: tunable params
    constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
    constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
    constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
    constexpr index_t EPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;

    // read params: dependent params
    constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;

    // calculate dependent params amd heuristic params
#if CK_PARAM_PROBLEM_DIRECTION == 2
    // In the WrW direction the filter is the output, while the output image is the input being
    // convolved with the (original) input image. This requires that the tensordescriptors be
    // swapped
    // To reuse the fwd kernel for this operation we need to swap the n and c dimension of the
    // input descriptor, the n and k dimension of the output descriptor
    // This change is necessary so that reduction dimensions are consistent with the requirement
    // of the wrw convolution when used in a fwd context
    constexpr auto tmp_in_nchw_desc =
        make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto tmp_wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto tmp_out_nkhw_desc =
        make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    constexpr auto in_nchw_desc = tmp_in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
    // wei and out are swapped in the solver
    constexpr auto wei_kcyx_desc = tmp_out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
    constexpr auto out_nkhw_desc = tmp_wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});

    constexpr auto dir = ImplicitGemmDirection::BackwardWeight;

    // swap stride and dilation
    using ConvDilations = Sequence<ConvStrideH, ConvStrideW>;
    using ConvStrides   = Sequence<ConvDilationH, ConvDilationW>;
#else
    constexpr auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    constexpr auto dir = ImplicitGemmDirection::ForwardData;

    using ConvStrides   = Sequence<ConvStrideH, ConvStrideW>;
    using ConvDilations = Sequence<ConvDilationH, ConvDilationW>;
#endif // CK_PARAM_PROBLEM_DIRECTION == 2

    constexpr index_t GemmMPerThreadSubC = CK_PARAM_GEMM_M_PER_THREAD_SUB_C;
    constexpr index_t GemmNPerThreadSubC = CK_PARAM_GEMM_N_PER_THREAD_SUB_C;
    constexpr index_t GemmMLevel0Cluster = CK_PARAM_GEMM_M_LEVEL0_CLUSTER;
    constexpr index_t GemmNLevel0Cluster = CK_PARAM_GEMM_N_LEVEL0_CLUSTER;
    constexpr index_t GemmMLevel1Cluster = CK_PARAM_GEMM_M_LEVEL1_CLUSTER;
    constexpr index_t GemmNLevel1Cluster = CK_PARAM_GEMM_N_LEVEL1_CLUSTER;
    constexpr index_t GemmKPerThreadLoop = 1;

    constexpr index_t GemmNRepeat = CK_PARAM_GEMM_N_REPEAT;
    constexpr index_t N1          = GemmNRepeat;
    constexpr index_t N2          = GemmNPerThreadSubC;

    constexpr index_t InBlockCopyClusterLengths_E  = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t InBlockCopyClusterLengths_B  = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
    constexpr index_t InBlockCopyClusterLengths_N1 = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_N1;
    constexpr index_t InBlockCopyClusterLengths_N2 = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_N2;

    constexpr index_t InBlockCopySubLengths_E  = EPerBlock / InBlockCopyClusterLengths_E;
    constexpr index_t InBlockCopySubLengths_B  = BPerBlock / InBlockCopyClusterLengths_B;
    constexpr index_t InBlockCopySubLengths_N1 = N1 / InBlockCopyClusterLengths_N1;
    constexpr index_t InBlockCopySubLengths_N2 = N2 / InBlockCopyClusterLengths_N2;

    constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
    constexpr index_t WeiBlockCopySubLengths_E     = EPerBlock / WeiBlockCopyClusterLengths_E;
    constexpr index_t WeiBlockCopySubLengths_K     = KPerBlock / WeiBlockCopyClusterLengths_K;

#if MIOPEN_USE_FP32
    constexpr index_t GemmDataPerReadA = GemmMPerThreadSubC;
    constexpr index_t GemmDataPerReadB = GemmNPerThreadSubC;

    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<InBlockCopySubLengths_E,
                                                     InBlockCopySubLengths_N1,
                                                     InBlockCopySubLengths_B,
                                                     InBlockCopySubLengths_N2>;
    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<InBlockCopyClusterLengths_E,
                                                         InBlockCopyClusterLengths_N1,
                                                         InBlockCopyClusterLengths_B,
                                                         InBlockCopyClusterLengths_N2>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]

    constexpr index_t InBlockCopySrcDataPerRead_B   = CK_PARAM_IN_BLOCK_COPY_SRC_DATA_PER_READ_B;
    constexpr index_t InBlockCopyDstDataPerWrite_N2 = CK_PARAM_IN_BLOCK_COPY_DST_DATA_PER_WRITE_N2;

    using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = CK_PARAM_WEI_BLOCK_COPY_SRC_DATE_PER_READ_E;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATE_PER_WRITE_K;
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
    constexpr index_t GemmDataPerReadA = 1;
    constexpr index_t GemmDataPerReadB = 1;

    constexpr index_t EPACK = CK_PARAM_EPACK_LENGTH;

    using InBlockCopySubLengths_E_N1_B_N2_EPACK = Sequence<InBlockCopySubLengths_E,
                                                           InBlockCopySubLengths_N1,
                                                           InBlockCopySubLengths_B,
                                                           InBlockCopySubLengths_N2,
                                                           EPACK>;
    using InBlockCopyClusterLengths_E_N1_B_N2_EPACK = Sequence<InBlockCopyClusterLengths_E,
                                                               InBlockCopyClusterLengths_N1,
                                                               InBlockCopyClusterLengths_B,
                                                               InBlockCopyClusterLengths_N2,
                                                               1>;

    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
    constexpr index_t InBlockCopyDstDataPerWrite_N2 = CK_PARAM_IN_BLOCK_COPY_DST_DATA_PER_WRITE_N2;

    // EPACK - E dimension is folded into 2 dimensions E and EPACK
    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2, 4>; // [E, N1, N2, B, EPACK]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2, 4>; // [E, N1, N2, B, EPACK]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3, 4>; // [E, N1, B, N2, EPACK]

    using WeiBlockCopySubLengths_E_K_EPACK =
        Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K, EPACK>;
    using WeiBlockCopyClusterLengths_E_K_EPACK =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K, 1>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0, 2>; // [K, E, EPACK]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0, 2>; // [K, E, EPACK]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1, 2>; // [E, K, EPACK]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = CK_PARAM_WEI_BLOCK_COPY_SRC_DATE_PER_READ_E;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#else
    static_assert(false, "wrong! Only kperblock could be 32/64/128 not supported");
#endif

#if MIOPEN_USE_FP32
    constexpr auto gridwise_conv =
        GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer<
            GridSize, BlockSize, FLOAT, FLOAT_ACCUM,
            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
            ConvStrides, ConvDilations,
            BPerBlock, KPerBlock, EPerBlock, GemmNRepeat,
            GemmMPerThreadSubC, GemmNPerThreadSubC,
            GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
            GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
            InBlockCopySubLengths_E_N1_B_N2, InBlockCopyClusterLengths_E_N1_B_N2,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
            InBlockCopySrcDataPerRead_B, InBlockCopyDstDataPerWrite_N2,
            WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K,
            dir>{};
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
    constexpr auto gridwise_conv =
        GridwiseConvolutionImplicitGemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer<
            GridSize, BlockSize, FLOAT, FLOAT_ACCUM,
            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
            ConvStrides, ConvDilations,
            BPerBlock, KPerBlock, EPerBlock, GemmNRepeat, EPACK,
            GemmMPerThreadSubC, GemmNPerThreadSubC,
            GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
            GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
            InBlockCopySubLengths_E_N1_B_N2_EPACK, InBlockCopyClusterLengths_E_N1_B_N2_EPACK,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
            InBlockCopySrcDataPerRead_B, InBlockCopyDstDataPerWrite_N2,
            WeiBlockCopySubLengths_E_K_EPACK, WeiBlockCopyClusterLengths_E_K_EPACK,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K,
            dir>{};
#else
    static_assert(false, "wrong! Only fp32, fp16 and bfp16 are supported.");
#endif

    gridwise_conv.Run(p_in_global, p_wei_global, p_out_global);
}
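The WrW branch above reuses the forward kernel by renaming tensors; a concrete reading of the descriptor permutations, with hypothetical sizes:

// Illustration only - sizes are made up; ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{})
// swaps the first two lengths of a 4-d descriptor.
constexpr int N = 64, C = 32, K = 128, Hi = 14, Wi = 14, Ho = 14, Wo = 14, Y = 3, X = 3;
// kernel's "input"  = original input  [N, C, Hi, Wi] -> [C, N, Hi, Wi]
// kernel's "weight" = original output [N, K, Ho, Wo] -> [K, N, Ho, Wo]
// kernel's "output" = original filter [K, C, Y,  X ] -> [C, K, Y,  X ]
// so the GEMM reduction runs over N (and the output spatial positions), which is the
// reduction a weight-gradient convolution needs; strides and dilations swap roles for
// the same reason.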
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer.cpp
0 → 100644

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer.hpp"
#include "float_types.h"
#include "implicitgemm_params.hpp"

extern "C" __global__
    __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void
    gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer(
        const FLOAT* const __restrict__ p_in_global,
        const FLOAT* const __restrict__ p_wei_global,
        FLOAT* const __restrict__ p_out_global)
{
    using namespace ck;

    // read params: problem decription
    constexpr index_t N  = CK_PARAM_PROBLEM_N;
    constexpr index_t K  = CK_PARAM_PROBLEM_K;
    constexpr index_t C  = CK_PARAM_PROBLEM_C;
    constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
    constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
    constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
    constexpr index_t Wo = CK_PARAM_PROBLEM_WO;

    constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
    constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;

    // read params: tunable params
    constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
    constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
    constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
    constexpr index_t EPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;

    // read params: dependent params
    constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;

    // calculate dependent params amd heuristic params
    constexpr auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto wei_ck_desc   = make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<1, C>{});
    constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    using ConvStrides = Sequence<ConvStrideH, ConvStrideW>;

    constexpr index_t InBlockCopyClusterLengths_E = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t InBlockCopyClusterLengths_B = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
    constexpr index_t InBlockCopySubLengths_E     = EPerBlock / InBlockCopyClusterLengths_E;
    constexpr index_t InBlockCopySubLengths_B     = BPerBlock / InBlockCopyClusterLengths_B;

    constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
    constexpr index_t WeiBlockCopySubLengths_E     = EPerBlock / WeiBlockCopyClusterLengths_E;
    constexpr index_t WeiBlockCopySubLengths_K     = KPerBlock / WeiBlockCopyClusterLengths_K;

    using InBlockCopySubLengths_E_B     = Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B>;
    using InBlockCopyClusterLengths_E_B =
        Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

    constexpr index_t InBlockCopyDataPerAccess_B = CK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B;

    using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = CK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K;

    constexpr index_t OutThreadCopyDataPerAccess_B = CK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B;

    constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE;
    constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE;
    constexpr auto GemmMWaves   = KPerBlock / GemmMPerWave;
    constexpr auto GemmNWaves   = BPerBlock / GemmNPerWave;

    constexpr auto GemmDataPerReadA = 1;
    constexpr auto GemmDataPerReadB = 1;

    constexpr auto EnableXdlops = CK_ENABLE_XDLOPS == 1;

    constexpr auto gridwise_conv =
        GridwiseConvolutionImplicitGemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer<
            GridSize, BlockSize, FLOAT,
            decltype(in_nchw_desc), decltype(wei_ck_desc), decltype(out_nkhw_desc),
            ConvStrides,
            static_cast<ImplicitGemmDirection>(CK_PARAM_PROBLEM_DIRECTION),
            BPerBlock, KPerBlock, EPerBlock,
            GemmMPerWave, GemmNPerWave, GemmMWaves, GemmNWaves,
            GemmDataPerReadA, GemmDataPerReadB,
            EnableXdlops,
            InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
            InBlockCopyDataPerAccess_B,
            WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K,
            OutThreadCopyDataPerAccess_B>{};

    gridwise_conv.Run(p_in_global, p_wei_global, p_out_global);
}
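The xdlops wrappers split the block tile across waves; a worked example of the wave decomposition with made-up tuning values (wave64 hardware assumed):

#include "common_header.hpp"

// Hypothetical tuning values, for illustration only.
constexpr ck::index_t KPerBlock    = 128, BPerBlock    = 64; // block tile (GEMM M x N)
constexpr ck::index_t GemmMPerWave = 64,  GemmNPerWave = 32; // per-wave tile

constexpr ck::index_t GemmMWaves = KPerBlock / GemmMPerWave; // 2 waves along M
constexpr ck::index_t GemmNWaves = BPerBlock / GemmNPerWave; // 2 waves along N

static_assert(GemmMWaves * GemmNWaves * 64 == 256,
              "4 wave64 wavefronts = 256 threads = CK_PARAM_TUNABLE_BLOCK_SIZE");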
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp
0 → 100644

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "float_types.h"

extern "C" __global__
    __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void
    gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer(
        const FLOAT* const __restrict__ p_in_global,
        const FLOAT* const __restrict__ p_wei_global,
        FLOAT* const __restrict__ p_out_global)
{
    using namespace ck;

    // read params: problem decription
    constexpr index_t N  = CK_PARAM_PROBLEM_N;
    constexpr index_t K  = CK_PARAM_PROBLEM_K;
    constexpr index_t C  = CK_PARAM_PROBLEM_C;
    constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
    constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
    constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
    constexpr index_t Wo = CK_PARAM_PROBLEM_WO;
    constexpr index_t Y  = CK_PARAM_PROBLEM_Y;
    constexpr index_t X  = CK_PARAM_PROBLEM_X;

    constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
    constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;

    constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H;
    constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W;

    // read params: tunable params
    constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
    constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
    constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
    constexpr index_t EPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;

    // read params: dependent params
    constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;

    // calculate dependent params amd heuristic params
#if CK_PARAM_PROBLEM_DIRECTION == 2
    // In the WrW direction the filter is the output, while the output image is the input being
    // convolved with the (original) input image. This requires that the tensordescriptors be
    // swapped
    // To reuse the fwd kernel for this operation we need to swap the n and c dimension of the
    // input descriptor, the n and k dimension of the output descriptor
    // This change is necessary so that reduction dimensions are consistent with the requirement
    // of the wrw convolution when used in a fwd context
    constexpr auto tmp_in_nchw_desc =
        make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto tmp_wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto tmp_out_nkhw_desc =
        make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    constexpr auto in_nchw_desc = tmp_in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
    // wei and out are swapped in the solver
    constexpr auto wei_kcyx_desc = tmp_out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
    constexpr auto out_nkhw_desc = tmp_wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});

    constexpr auto dir = ImplicitGemmDirection::BackwardWeight;

    // swap stride and dilation
    using ConvDilations = Sequence<ConvStrideH, ConvStrideW>;
    using ConvStrides   = Sequence<ConvDilationH, ConvDilationW>;
#else
    // calculate dependent params amd heuristic params
    constexpr auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    constexpr auto dir = ImplicitGemmDirection::ForwardData;

    using ConvStrides   = Sequence<ConvStrideH, ConvStrideW>;
    using ConvDilations = Sequence<ConvDilationH, ConvDilationW>;
#endif // CK_PARAM_PROBLEM_DIRECTION == 2

    constexpr index_t InBlockCopyClusterLengths_E = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t InBlockCopyClusterLengths_B = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
    constexpr index_t InBlockCopySubLengths_E     = EPerBlock / InBlockCopyClusterLengths_E;
    constexpr index_t InBlockCopySubLengths_B     = BPerBlock / InBlockCopyClusterLengths_B;

    constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
    constexpr index_t WeiBlockCopySubLengths_E     = EPerBlock / WeiBlockCopyClusterLengths_E;
    constexpr index_t WeiBlockCopySubLengths_K     = KPerBlock / WeiBlockCopyClusterLengths_K;

    constexpr index_t EPack = CK_PARAM_EPACK_LENGTH;

#if MIOPEN_USE_FP32
    using InBlockCopySubLengths_E_B     = Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B>;
    using InBlockCopyClusterLengths_E_B =
        Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

    using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
    using InBlockCopySubLengths_E_B =
        Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B, EPack>;
    using InBlockCopyClusterLengths_E_B =
        Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B, 1>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 2>; // [E, B, EPack]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 2>; // [E, B, EPack]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2>; // [E, B, EPack]

    using WeiBlockCopySubLengths_E_K =
        Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K, EPack>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K, 1>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0, 2>; // [K, E, EPack]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0, 2>; // [K, E, EPack]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1, 2>; // [E, K, EPack]
#endif

    constexpr index_t InBlockCopyDataPerAccess_B    = CK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B;
    constexpr index_t WeiBlockCopySrcDataPerRead_E  = CK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K;
    constexpr index_t OutThreadCopyDataPerAccess_B  = CK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B;

    constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE;
    constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE;
    constexpr auto GemmMWaves   = KPerBlock / GemmMPerWave;
    constexpr auto GemmNWaves   = BPerBlock / GemmNPerWave;

    constexpr index_t GemmDataPerReadA = 1;
    constexpr index_t GemmDataPerReadB = 1;

    constexpr auto gridwise_conv =
#if MIOPEN_USE_FP32
        GridwiseConvolutionImplicitGemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer<
            GridSize, BlockSize, FLOAT, FLOAT_ACCUM,
            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
            ConvStrides, ConvDilations,
            BPerBlock, KPerBlock, EPerBlock, EPack,
            GemmMPerWave, GemmNPerWave, GemmMWaves, GemmNWaves,
            GemmDataPerReadA, GemmDataPerReadB,
            (CK_ENABLE_XDLOPS == 1),
            InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
            InBlockCopyDataPerAccess_B,
            WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K,
            OutThreadCopyDataPerAccess_B, dir>{};
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
        GridwiseConvolutionImplicitGemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer<
            GridSize, BlockSize, FLOAT, FLOAT_ACCUM,
            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
            ConvStrides, ConvDilations,
            BPerBlock, KPerBlock, EPerBlock, EPack,
            GemmMPerWave, GemmNPerWave, GemmMWaves, GemmNWaves,
            GemmDataPerReadA, GemmDataPerReadB,
            (CK_ENABLE_XDLOPS == 1),
            InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
            InBlockCopyDataPerAccess_B,
            WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder,
            WeiBlockCopySrcDataPerRead_E,
,
WeiBlockCopyDstDataPerWrite_K
,
OutThreadCopyDataPerAccess_B
,
dir
>
{};
#else
static_assert
(
false
,
"wrong! Only fp32, fp16 and bfp16 are supported."
);
#endif
gridwise_conv
.
Run
(
p_in_global
,
p_wei_global
,
p_out_global
);
}
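The WrW branch above reuses the forward kernel by relabelling dimensions: ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{}) swaps the first two dimensions of a packed descriptor, and the stride/dilation sequences are exchanged as well. As a minimal standalone sketch (plain C++, not the ck descriptor API; the helper name and sizes below are made up for illustration), the length reorder amounts to:

// Minimal sketch of a "new2old" reorder of tensor lengths, e.g. Sequence<1, 0, 2, 3>
// turning an NCHW descriptor into CNHW. Not the ck API; names/sizes are illustrative.
#include <array>
#include <cstdio>

// new_lengths[d] = old_lengths[new2old[d]]
std::array<int, 4> reorder_given_new2old(const std::array<int, 4>& old_lengths,
                                         const std::array<int, 4>& new2old)
{
    std::array<int, 4> new_lengths{};
    for(int d = 0; d < 4; ++d)
        new_lengths[d] = old_lengths[new2old[d]];
    return new_lengths;
}

int main()
{
    // hypothetical sizes: N=8, C=64, Hi=4, Wi=4
    std::array<int, 4> in_nchw{8, 64, 4, 4};
    // swap the first two dimensions, as the WrW path does with Sequence<1, 0, 2, 3>
    std::array<int, 4> in_cnhw = reorder_given_new2old(in_nchw, {1, 0, 2, 3});
    std::printf("reordered lengths: %d %d %d %d\n",
                in_cnhw[0], in_cnhw[1], in_cnhw[2], in_cnhw[3]);
}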
driver/include/conv_common.hpp
@@ -32,6 +32,7 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDe
     constexpr auto HO = HI + 1 - Y;
     constexpr auto WO = WI + 1 - X;
+    printf("H0=%d, W0=%d\n", HO, WO);
     return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
 }
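The hunk above computes the output size for the no-padding, unit-stride, unit-dilation case, HO = HI + 1 - Y. Below is a small sketch of the general output-size formula this is a special case of; the function name and sizes are illustrative only, not part of conv_common.hpp:

// General convolution output size; with pad = 0, stride = 1, dilation = 1 this
// reduces to hi + 1 - y, which is what the hunk above computes.
#include <cstdio>

int conv_out_size(int hi, int y, int pad, int stride, int dilation)
{
    return (hi + 2 * pad - dilation * (y - 1) - 1) / stride + 1;
}

int main()
{
    // hypothetical sizes: HI = 4, Y = 3 -> HO = 2
    std::printf("HO = %d\n", conv_out_size(4, 3, /*pad=*/0, /*stride=*/1, /*dilation=*/1));
}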
driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
 #pragma once
 #include <unistd.h>
-#define MIOPEN_USE_FP16 1
+#define MIOPEN_USE_FP16 0
 #define MIOPEN_USE_BFP16 0
-#define MIOPEN_USE_FP32 0
+#define MIOPEN_USE_FP32 1
 #define __HIP_PLATFORM_HCC__ 1
driver/include/device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw.hpp (new file, 0 → 100644)
#pragma once
#include <unistd.h>

#define MIOPEN_USE_FP16 0
#define MIOPEN_USE_BFP16 0
#define MIOPEN_USE_FP32 1
#define __HIP_PLATFORM_HCC__ 1

#include "float_types.h"
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
//#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp"

#define CK_ENABLE_XDLOPS 0

#define CK_PARAM_PROBLEM_DIRECTION 0
#define CK_PARAM_EPACK_LENGTH 1

#define CK_PARAM_TUNABLE_BLOCK_SIZE 64
#define CK_PARAM_TUNABLE_K_PER_BLOCK 32
#define CK_PARAM_TUNABLE_B_PER_BLOCK 64
#define CK_PARAM_TUNABLE_E_PER_BLOCK 8
#define CK_PARAM_DEPENDENT_GRID_SIZE 16
#define CK_PARAM_GEMM_M_PER_WAVE 32
#define CK_PARAM_GEMM_N_PER_WAVE 64
#define CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E 8
#define CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B 8
#define CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E 4
#define CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K 16
#define CK_PARAM_PROBLEM_CONV_DILATION_W 1
#define CK_PARAM_PROBLEM_CONV_DILATION_H 1
#define CK_PARAM_PROBLEM_CONV_STRIDE_H 1
#define CK_PARAM_PROBLEM_CONV_STRIDE_W 1
#define CK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B 1
#define CK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E 2
#define CK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K 2
#define CK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B 1

using namespace ck;

template <class T,
          class InDesc,
          class WeiDesc,
          class OutDesc,
          class ConvStrides,
          class ConvDilations>
void device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw(InDesc,
                                                        const Tensor<T>& in_nchw,
                                                        WeiDesc,
                                                        const Tensor<T>& wei_kcyx,
                                                        OutDesc,
                                                        Tensor<T>& out_nkhw,
                                                        ConvStrides,
                                                        ConvDilations,
                                                        index_t nrepeat)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_desc_org  = InDesc{};
    constexpr auto wei_kcyx_desc_org = WeiDesc{};
    constexpr auto out_nkhw_desc_org = OutDesc{};

    constexpr index_t Hi = in_nchw_desc_org.GetLength(I2);
    constexpr index_t Wi = in_nchw_desc_org.GetLength(I3);

    constexpr index_t N  = out_nkhw_desc_org.GetLength(I0);
    constexpr index_t Ho = out_nkhw_desc_org.GetLength(I2);
    constexpr index_t Wo = out_nkhw_desc_org.GetLength(I3);

    constexpr index_t K = wei_kcyx_desc_org.GetLength(I0);
    constexpr index_t C = wei_kcyx_desc_org.GetLength(I1);
    constexpr index_t Y = wei_kcyx_desc_org.GetLength(I2);
    constexpr index_t X = wei_kcyx_desc_org.GetLength(I3);

    constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
    constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;

    constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H;
    constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W;

    // read params: tunable params
    constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
    constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
    constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
    constexpr index_t EPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;

    // read params: dependent params
    constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;

    // calculate dependent params and heuristic params
#if CK_PARAM_PROBLEM_DIRECTION == 2
    // In the WrW direction the filter is the output, while the output image is the input being
    // convolved with the (original) input image. This requires that the tensor descriptors be
    // swapped.
    // To reuse the fwd kernel for this operation we need to swap the n and c dimension of the
    // input descriptor, and the n and k dimension of the output descriptor.
    // This change is necessary so that reduction dimensions are consistent with the requirement
    // of the wrw convolution when used in a fwd context.
    printf("backward weight is executed\n");

    constexpr auto tmp_in_nchw_desc =
        make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto tmp_wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto tmp_out_nkhw_desc =
        make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    constexpr auto in_nchw_desc = tmp_in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
    // wei and out are swapped in the solver
    constexpr auto wei_kcyx_desc = tmp_out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
    constexpr auto out_nkhw_desc = tmp_wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});

    constexpr auto dir = ImplicitGemmDirection::BackwardWeight;

    // swap stride and dilation
    // using ConvDilations = Sequence<ConvStrideH, ConvStrideW>;
    // using ConvStrides   = Sequence<ConvDilationH, ConvDilationW>;
#else
    printf("forward data is executed\n");

    // calculate dependent params and heuristic params
    constexpr auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    constexpr auto dir = ImplicitGemmDirection::ForwardData;

    // using ConvStrides   = Sequence<ConvStrideH, ConvStrideW>;
    // using ConvDilations = Sequence<ConvDilationH, ConvDilationW>;
#endif // CK_PARAM_PROBLEM_DIRECTION == 2

    constexpr index_t InBlockCopyClusterLengths_E = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t InBlockCopyClusterLengths_B = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
    constexpr index_t InBlockCopySubLengths_E     = EPerBlock / InBlockCopyClusterLengths_E;
    constexpr index_t InBlockCopySubLengths_B     = BPerBlock / InBlockCopyClusterLengths_B;

    constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
    constexpr index_t WeiBlockCopySubLengths_E     = EPerBlock / WeiBlockCopyClusterLengths_E;
    constexpr index_t WeiBlockCopySubLengths_K     = KPerBlock / WeiBlockCopyClusterLengths_K;

    constexpr index_t EPack = CK_PARAM_EPACK_LENGTH;

#if MIOPEN_USE_FP32
    printf("fp32 is executed\n");

    using InBlockCopySubLengths_E_B = Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B>;
    using InBlockCopyClusterLengths_E_B =
        Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

    using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
    using InBlockCopySubLengths_E_B =
        Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B, EPack>;
    using InBlockCopyClusterLengths_E_B =
        Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B, 1>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 2>; // [E, B, EPack]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 2>; // [E, B, EPack]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2>; // [E, B, EPack]

    using WeiBlockCopySubLengths_E_K =
        Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K, EPack>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K, 1>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0, 2>; // [K, E, EPack]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0, 2>; // [K, E, EPack]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1, 2>; // [E, K, EPack]
#endif

    constexpr index_t InBlockCopyDataPerAccess_B    = CK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B;
    constexpr index_t WeiBlockCopySrcDataPerRead_E  = CK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K;
    constexpr index_t OutThreadCopyDataPerAccess_B  = CK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B;

    constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE;
    constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE;
    constexpr auto GemmMWaves   = KPerBlock / GemmMPerWave;
    constexpr auto GemmNWaves   = BPerBlock / GemmNPerWave;

    constexpr index_t GemmDataPerReadA = 1;
    constexpr index_t GemmDataPerReadB = 1;

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

    // #if MIOPEN_USE_FP16 == 1
    // // ES set to 4 as dot4 operator is supported on fp16 in MI100
    // constexpr index_t ES = 4;
    // #elif MIOPEN_USE_BFP16 == 1
    // // ES set to 2 as dot2 operator is supported on bfp16 in MI100
    // constexpr index_t ES = 2;
    // #else
    // // do nothing
    // #endif

    // constexpr index_t GridSize =
    //     ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    for(index_t i = 0; i < nrepeat; ++i)
    {
        constexpr auto gridwise_conv =
#if MIOPEN_USE_FP32 == 1
            GridwiseConvolutionImplicitGemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer<
                GridSize,
                BlockSize,
                FLOAT,
                FLOAT_ACCUM,
                decltype(in_nchw_desc),
                decltype(wei_kcyx_desc),
                decltype(out_nkhw_desc),
                ConvStrides,
                ConvDilations,
                BPerBlock,
                KPerBlock,
                EPerBlock,
                EPack,
                GemmMPerWave,
                GemmNPerWave,
                GemmMWaves,
                GemmNWaves,
                GemmDataPerReadA,
                GemmDataPerReadB,
                false,
                InBlockCopySubLengths_E_B,
                InBlockCopyClusterLengths_E_B,
                InBlockCopyThreadClusterArrangeOrder,
                InBlockCopySrcAccessOrder,
                InBlockCopyDstAccessOrder,
                InBlockCopyDataPerAccess_B,
                WeiBlockCopySubLengths_E_K,
                WeiBlockCopyClusterLengths_E_K,
                WeiBlockCopyThreadClusterArrangeOrder,
                WeiBlockCopySrcAccessOrder,
                WeiBlockCopyDstAccessOrder,
                WeiBlockCopySrcDataPerRead_E,
                WeiBlockCopyDstDataPerWrite_K,
                OutThreadCopyDataPerAccess_B,
                dir>{};
#elif MIOPEN_USE_FP16 == 1 || MIOPEN_USE_BFP16 == 1
            GridwiseConvolutionImplicitGemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer<
                GridSize,
                BlockSize,
                FLOAT,
                FLOAT_ACCUM,
                decltype(in_nchw_desc),
                decltype(wei_kcyx_desc),
                decltype(out_nkhw_desc),
                ConvStrides,
                ConvDilations,
                BPerBlock,
                KPerBlock,
                EPerBlock,
                EPack,
                GemmMPerWave,
                GemmNPerWave,
                GemmMWaves,
                GemmNWaves,
                GemmDataPerReadA,
                GemmDataPerReadB,
                false,
                InBlockCopySubLengths_E_B,
                InBlockCopyClusterLengths_E_B,
                InBlockCopyThreadClusterArrangeOrder,
                InBlockCopySrcAccessOrder,
                InBlockCopyDstAccessOrder,
                InBlockCopyDataPerAccess_B,
                WeiBlockCopySubLengths_E_K,
                WeiBlockCopyClusterLengths_E_K,
                WeiBlockCopyThreadClusterArrangeOrder,
                WeiBlockCopySrcAccessOrder,
                WeiBlockCopyDstAccessOrder,
                WeiBlockCopySrcDataPerRead_E,
                WeiBlockCopyDstDataPerWrite_K,
                OutThreadCopyDataPerAccess_B,
                dir>{};
#endif

        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

        printf("Elapsed time : %f ms, %f TFlop/s\n",
               time,
               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                   (std::size_t(1000) * 1000 * 1000) / time);

        usleep(std::min(time * 1000, float(10000)));
    }

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
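The timing loop above reports TFlop/s by dividing the flop count by 10^9 and then by the elapsed time in milliseconds. Below is a minimal sketch of that arithmetic, assuming the usual 2 * N * K * C * Ho * Wo * Y * X flop count for a direct convolution; the actual count comes from calculate_convolution_flops in conv_common.hpp, and the sizes and timing below are made up:

// Sketch of the TFlop/s number printed by the driver, under the assumption that
// the flop count is 2 * N * K * C * Ho * Wo * Y * X (one MAC counted as 2 flops).
#include <cstdio>

int main()
{
    // hypothetical problem size: N=32, C=64, K=32, Ho=2, Wo=2, Y=3, X=3
    double N = 32, C = 64, K = 32, Ho = 2, Wo = 2, Y = 3, X = 3;
    double flops = 2.0 * N * K * C * Ho * Wo * Y * X;

    double time_ms = 0.05; // made-up elapsed kernel time in milliseconds

    // flops / 1e9 is GFLOP; GFLOP divided by milliseconds is TFLOP/s,
    // which matches the units in the driver's printf
    double tflops = flops / (1000.0 * 1000.0 * 1000.0) / time_ms;

    std::printf("%f TFlop/s\n", tflops);
}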
driver/src/driver.cpp
@@ -8,11 +8,12 @@
 #include "device.hpp"
 #include "conv_common.hpp"
 #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
-#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
-#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
+// #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
+// #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
+// #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
+// #include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
+//#include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw.hpp"
 using namespace ck;
@@ -400,6 +401,7 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
     float ref_value = 0, result_value = 0;
     for(int i = 0; i < ref.mData.size(); ++i)
     {
+        std::cout << result.mData[i] << " ";
         error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
         float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
         if(max_diff < diff)
@@ -410,6 +412,7 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
         }
     }
+    std::cout << std::endl;
     std::cout << "error: " << error << std::endl;
     std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
 }
@@ -803,7 +806,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
 #elif 1
-    constexpr index_t N  = 8;
+    constexpr index_t N  = 32;
     constexpr index_t C  = 64;
     constexpr index_t HI = 4;
     constexpr index_t WI = 4;
@@ -830,8 +833,8 @@ int main(int argc, char* argv[])
     ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
     ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
-    using in_data_t  = half;
-    using out_data_t = half;
+    using in_data_t  = float;
+    using out_data_t = float;
     Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
     Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
     Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
@@ -850,7 +853,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 0
+#if 1
         in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 0
@@ -859,7 +862,7 @@ int main(int argc, char* argv[])
 #elif 0
         in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-#elif 1
+#elif 0
         in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #elif 0
@@ -883,8 +886,10 @@ int main(int argc, char* argv[])
         device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
 #elif 0
         device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
-#elif 1
+#elif 0
         device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
+#elif 1
+        device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw
 #endif
         (in_nchw_desc,
          in_nchw,
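The check_error hunks above add per-element printing to a routine that accumulates two metrics: a running sum of absolute differences and the largest single difference between reference and result. A minimal standalone sketch of those metrics (plain std::vector in place of the driver's Tensor<T>, with made-up values):

// Sketch of the accumulated error and max_diff metrics used by check_error.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> ref{1.0f, 2.0f, 3.0f};
    std::vector<float> result{1.0f, 2.5f, 2.0f}; // made-up values

    double error    = 0;
    double max_diff = 0;
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        double diff = std::abs(double(ref[i]) - double(result[i]));
        error += diff;       // sum of absolute differences
        if(max_diff < diff)  // track the worst single element
            max_diff = diff;
    }
    std::printf("error: %f, max_diff: %f\n", error, max_diff);
}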