"...resnet50_tensorflow.git" did not exist on "c57836560fb8da10cec4cc7eb86898306f9b5aff"
Unverified commit f63a23ac authored by Chao Liu, committed by GitHub

[MIOpen Downstream] Initial MIOpen integration (#52)

* update online kernel wrapper: bundle all descriptors in a tuple

* change __CONSTANT__ to CONSTANT

* rename

* adding tuning

* added IsValidCompileParameter

* reorganize

* adding tunable for fp16 and int8

* fix kernel compile warnings and other bugs

* suppress warning about casting CONSTANT (address space 4) pointers

* fix build issue
parent 12649254
#ifndef CK_AMD_DLOP_HPP
#define CK_AMD_DLOP_HPP
#include "float_type.hpp"
#include "data_type.hpp"
namespace ck {
......
#ifndef CK_AMD_INLINE_ASM_HPP
#define CK_AMD_INLINE_ASM_HPP
#include "float_type.hpp"
#include "data_type.hpp"
namespace ck {
......
#ifndef CK_AMD_LLVM_INTRINSIC_HPP
#define CK_AMD_LLVM_INTRINSIC_HPP
#include "float_type.hpp"
#include "data_type.hpp"
namespace ck {
__device__ int32_t __llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane");
__device__ int32_t llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane");
} // namespace ck
#endif
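// Illustrative usage (not part of this diff; the call-site name is hypothetical):
// readfirstlane broadcasts the value held by the first active lane to the whole
// wavefront, letting the compiler keep the result in a scalar register, e.g.
//   int32_t uniform_offset = llvm_amdgcn_readfirstlane_i32(per_thread_offset);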
#ifndef CK_AMD_XDLOPS_HPP
#define CK_AMD_XDLOPS_HPP
#include "float_type.hpp"
#include "data_type.hpp"
namespace ck {
......
......@@ -7,8 +7,9 @@
#include "statically_indexed_array.hpp"
#include "container_element_picker.hpp"
#include "multi_index.hpp"
#include "data_type_enum.hpp"
#include "data_type.hpp"
#include "float_type.hpp"
#include "data_type_helper.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
......
......@@ -8,18 +8,13 @@
#include "bfloat16_dev.hpp"
// address space for kernel parameter
#define __CONSTANT__ __attribute__((address_space(4)))
#define CONSTANT __attribute__((address_space(4)))
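// Note on CONSTANT (address_space(4)): this is the AMDGPU constant address space
// used for kernel arguments. Per the commit message, casting such a pointer back
// to a generic pointer triggers a compiler warning that this change suppresses.
// A sketch of the assumed kernel-side usage (the descriptor type is a placeholder):
//   const void CONSTANT* p_desc = ...;  // kernel argument
//   const auto desc = *reinterpret_cast<const GridDescType*>((const void*)p_desc);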
// device backend
#define CK_DEVICE_BACKEND_AMD 1
// GPU ID
#if 0
#define CK_AMD_GPU_GFX906 1
#elif 1
#define CK_AMD_GPU_GFX908 1
#elif 0
#define CK_AMD_GPU_GFX1030 1
// GPU target
// should enable one and only one GPU target
#if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \
defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030))
#error Need to define a single GPU target
#endif
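// Example of the assumed host-side selection (not part of this header): the online
// compilation path is expected to define exactly one target when building, e.g.
//   hipcc ... -DCK_AMD_GPU_GFX908
// otherwise the #error above fires.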
// HIP version
......@@ -36,7 +31,8 @@
#endif
// buffer resource
#if defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908)
#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \
defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(CK_AMD_GPU_GFX1030)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
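// Added note (hedged): this constant is word 3 of the 128-bit buffer resource
// descriptor (V#) used by buffer_load/buffer_store; the field layout of that word
// differs between gfx9-class and gfx10-class ISAs, hence the two values.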
......@@ -50,10 +46,6 @@
#define CK_USE_AMD_INLINE_ASM 1
#endif
#ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif
// AMD DLOPS
#ifndef CK_USE_AMD_DLOP
#define CK_USE_AMD_DLOP 1
......@@ -78,14 +70,6 @@
#define CK_USE_AMD_XDLOPS 0
#endif
#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
#endif
#ifndef CK_USE_AMD_XDLOPS_EMULATE
#define CK_USE_AMD_XDLOPS_EMULATE 0 // For internal debug purposes
#endif
// block synchronization: only s_waitcnt lgkmcnt(0), not vmcnt(0)
#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
......@@ -104,18 +88,6 @@
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK 1
#endif
#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#endif
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif
// pass tensor descriptor by value or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 1
......@@ -131,17 +103,6 @@
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif
// workaround: put all workarounds here
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif
// workaround for accvgpr over-allocation
#ifndef CK_WORKAROUND_SWDEV_241664
#define CK_WORKAROUND_SWDEV_241664 1
#endif
// workaround for compiler crash when compiling recursive lambda
#ifndef CK_WORKAROUND_SWDEV_275126
#define CK_WORKAROUND_SWDEV_275126 1
......@@ -159,7 +120,7 @@
namespace ck {
enum AddressSpace
enum AddressSpaceEnum_t
{
Generic,
Global,
......@@ -168,7 +129,7 @@ enum AddressSpace
Vgpr
};
enum InMemoryDataOperation
enum InMemoryDataOperationEnum_t
{
Set,
AtomicAdd
......
#ifndef CK_DATA_TYPE_ENUM_HPP
#define CK_DATA_TYPE_ENUM_HPP
namespace ck {
// this enum must be kept synchronized with include/miopen.h
typedef enum {
Half = 0,
Float = 1,
Int32 = 2,
Int8 = 3,
Int8x4 = 4,
BFloat16 = 5,
Double = 6,
Unknown = 100,
} DataTypeEnum_t;
} // namespace ck
#endif
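// Illustrative consistency check (assumption, not part of this change): a host
// translation unit that also includes miopen.h could assert the required
// synchronization, e.g.
//   static_assert(static_cast<int>(ck::Half)  == static_cast<int>(miopenHalf), "");
//   static_assert(static_cast<int>(ck::Float) == static_cast<int>(miopenFloat), "");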
#ifndef CK_DATA_TYPE_HELPER_HPP
#define CK_DATA_TYPE_HELPER_HPP
#include "data_type.hpp"
#include "data_type_enum.hpp"
namespace ck {
template <DataTypeEnum_t DataTypeEnum>
struct get_datatype_from_enum;
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Int8>
{
using type = int8_t;
};
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Int32>
{
using type = int32_t;
};
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Half>
{
using type = half_t;
};
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Float>
{
using type = float;
};
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Double>
{
using type = double;
};
template <typename T>
struct get_datatype_enum_from_type;
template <>
struct get_datatype_enum_from_type<int8_t>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int8;
};
template <>
struct get_datatype_enum_from_type<int32_t>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int32;
};
template <>
struct get_datatype_enum_from_type<half_t>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Half;
};
template <>
struct get_datatype_enum_from_type<float>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Float;
};
template <>
struct get_datatype_enum_from_type<double>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Double;
};
} // namespace ck
#endif
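// Illustrative usage (not part of this change): the two traits are inverse
// mappings, which can be verified at compile time, e.g.
//   using half_roundtrip = ck::get_datatype_from_enum<ck::Half>::type;
//   static_assert(ck::get_datatype_enum_from_type<half_roundtrip>::value == ck::Half,
//                 "enum <-> type mapping mismatch");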
......@@ -5,7 +5,7 @@ namespace ck {
#include "amd_buffer_addressing_v2.hpp"
template <AddressSpace BufferAddressSpace, typename T, typename ElementSpaceSize>
template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
struct DynamicBuffer
{
using type = T;
......@@ -18,7 +18,7 @@ struct DynamicBuffer
{
}
__host__ __device__ static constexpr AddressSpace GetAddressSpace()
__host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
{
return BufferAddressSpace;
}
......@@ -32,7 +32,7 @@ struct DynamicBuffer
is_same<typename scalar_type<remove_cv_t<remove_reference_t<X>>>::type,
typename scalar_type<remove_cv_t<remove_reference_t<T>>>::type>::value,
bool>::type = false>
__host__ __device__ constexpr const auto Get(index_t i, bool is_valid_offset) const
__host__ __device__ constexpr auto Get(index_t i, bool is_valid_offset) const
{
// X contains multiple T
constexpr index_t scalar_per_t_vector =
......@@ -46,7 +46,7 @@ struct DynamicBuffer
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
if constexpr(GetAddressSpace() == AddressSpace::Global)
if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global)
{
#if CK_USE_AMD_BUFFER_ADDRESSING
return amd_buffer_load_v2<remove_cv_t<remove_reference_t<T>>, t_per_x>(
......@@ -80,7 +80,7 @@ struct DynamicBuffer
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
if constexpr(GetAddressSpace() == AddressSpace::Global)
if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global)
{
#if CK_USE_AMD_BUFFER_ADDRESSING
amd_buffer_store_v2<remove_cv_t<remove_reference_t<T>>, t_per_x>(
......@@ -92,14 +92,15 @@ struct DynamicBuffer
}
#endif
}
else if constexpr(GetAddressSpace() == AddressSpace::Lds)
else if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Lds)
{
if(is_valid_offset)
{
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
*reinterpret_cast<X*>(&p_data_[i]) = x;
#else
// HACK: compiler would lower IR "store<i8, 16> address_space(3)" into inefficient
// HACK: compiler would lower IR "store<i8, 16> address_space(3)" into
// inefficient
// ISA, so I try to let the compiler emit IR "store<i32, 4>", which would be lowered to
// ds_write_b128
// TODO: remove this after compiler fix
......@@ -119,7 +120,8 @@ struct DynamicBuffer
is_same<remove_cv_t<remove_reference_t<X>>, int8x8_t>::value) ||
(is_same<remove_cv_t<remove_reference_t<T>>, int8x16_t>::value &&
is_same<remove_cv_t<remove_reference_t<X>>, int8x16_t>::value),
"wrong! not implemented for this combination, please add implementation");
"wrong! not implemented for this combination, please add "
"implementation");
if constexpr(is_same<remove_cv_t<remove_reference_t<T>>, int8_t>::value &&
is_same<remove_cv_t<remove_reference_t<X>>, int8_t>::value)
......@@ -194,7 +196,7 @@ struct DynamicBuffer
__host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
};
template <AddressSpace BufferAddressSpace = AddressSpace::Generic,
template <AddressSpaceEnum_t BufferAddressSpace = AddressSpaceEnum_t::Generic,
typename T,
typename ElementSpaceSize>
__host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
......
......@@ -127,7 +127,8 @@ struct MagicDivision
DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
{
uint32_t dividend_u32 = as_type<uint32_t>(dividend_i32);
uint32_t tmp = ((uint64_t)dividend_u32 * (uint64_t)multiplier) >> 32;
uint32_t tmp =
(static_cast<uint64_t>(dividend_u32) * static_cast<uint64_t>(multiplier)) >> 32;
return (tmp + dividend_u32) >> shift;
}
#else
......
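// Worked example (illustration only; the multiplier/shift pair is assumed to be
// precomputed on the host): for divisor d = 3, one valid pair is shift = 2 (the
// smallest s with 2^s >= d) and multiplier = ((1u << s) - d) * 2^32 / d + 1 =
// 0x55555556. Dividing n = 9 then proceeds as in the code above:
//   tmp = (uint64_t(9) * 0x55555556) >> 32 = 3
//   (tmp + 9) >> 2                         = 3 == 9 / 3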
......@@ -150,7 +150,15 @@ __host__ __device__ constexpr auto min(X x, Ys... ys)
// greatest common divisor, aka highest common factor
__host__ __device__ constexpr index_t gcd(index_t x, index_t y)
{
if(x == y || x == 0)
if(x < 0)
{
return gcd(-x, y);
}
else if(y < 0)
{
return gcd(x, -y);
}
else if(x == y || x == 0)
{
return y;
}
......@@ -160,11 +168,11 @@ __host__ __device__ constexpr index_t gcd(index_t x, index_t y)
}
else if(x > y)
{
return gcd(x - y, y);
return gcd(x % y, y);
}
else
{
return gcd(x, y - x);
return gcd(x, y % x);
}
}
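// Worked example (illustration only): gcd(48, 18) now reduces via modulo,
//   gcd(48, 18) -> gcd(48 % 18, 18) = gcd(12, 18) -> gcd(12, 18 % 12) = gcd(12, 6)
//   -> gcd(12 % 6, 6) = gcd(0, 6) = 6,
// which converges much faster than the previous repeated subtraction, and the new
// sign handling makes gcd(-48, 18) return 6 as well.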
......@@ -181,7 +189,7 @@ template <typename X,
typename std::enable_if<sizeof...(Ys) >= 2, bool>::type = false>
__host__ __device__ constexpr auto gcd(X x, Ys... ys)
{
return gcd(x, ys...);
return gcd(x, gcd(ys...));
}
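// Illustration of the fix above: with three or more arguments the call now folds
// pairwise, e.g. gcd(8, 12, 20) == gcd(8, gcd(12, 20)) == gcd(8, 4) == 4, whereas
// the removed "return gcd(x, ys...);" re-invoked the same overload without ever
// shrinking the parameter pack.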
// least common multiple
......
......@@ -5,7 +5,7 @@
namespace ck {
template <AddressSpace BufferAddressSpace, typename T, index_t N>
template <AddressSpaceEnum_t BufferAddressSpace, typename T, index_t N>
struct StaticBuffer : public StaticallyIndexedArray<T, N>
{
using type = T;
......@@ -13,7 +13,7 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
__host__ __device__ constexpr StaticBuffer() : base{} {}
__host__ __device__ static constexpr AddressSpace GetAddressSpace()
__host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
{
return BufferAddressSpace;
}
......@@ -23,7 +23,9 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
__host__ __device__ static constexpr bool IsDynamicBuffer() { return false; }
};
template <AddressSpace BufferAddressSpace = AddressSpace::Generic, typename T, index_t N>
template <AddressSpaceEnum_t BufferAddressSpace = AddressSpaceEnum_t::Generic,
typename T,
index_t N>
__host__ __device__ constexpr auto make_static_buffer(Number<N>)
{
return StaticBuffer<BufferAddressSpace, T, N>{};
......
......@@ -5,8 +5,6 @@
namespace ck {
__device__ void __llvm_amdgcn_s_barrier() __asm("llvm.amdgcn.s.barrier");
__device__ void block_sync_lds()
{
#if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
......@@ -15,11 +13,9 @@ __device__ void block_sync_lds()
s_barrier \
" ::);
#else
__llvm_amdgcn_s_barrier();
__syncthreads();
#endif
}
__device__ void block_sync_lds_vmem() { __llvm_amdgcn_s_barrier(); }
} // namespace ck
#endif
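// Added note: the inline-asm path above waits only on LDS traffic
// (s_waitcnt lgkmcnt(0)) before the barrier, while the fallback now uses
// __syncthreads(), which also waits on outstanding vector memory operations;
// CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM selects between the two.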
#ifndef CK_TYPE_HELPER_HPP
#define CK_TYPE_HELPER_HPP
#include "float_type.hpp"
namespace ck {
template <char tid>
struct get_type_from_type_id
{
using type = float;
};
template <>
struct get_type_from_type_id<'H'>
{
using type = half_t;
};
template <>
struct get_type_from_type_id<'F'>
{
using type = float;
};
template <>
struct get_type_from_type_id<'D'>
{
using type = double;
};
} // namespace ck
#endif
#include "common_header.hpp"
#include "type_helper.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "gridwise_dynamic_gemm_v1r2.hpp"
#include "gridwise_dynamic_gemm_dlops_v1r2.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
using namespace ck;
using FloatAB = typename get_type_from_type_id<static_cast<char>(CK_PARAM_IN_WEI_DATATYPE)>::type;
using FloatC = typename get_type_from_type_id<static_cast<char>(CK_PARAM_OUT_DATATYPE)>::type;
using FloatAcc = typename get_type_from_type_id<static_cast<char>(CK_PARAM_CONV_COMPTYPE)>::type;
constexpr DataTypeEnum_t ABDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);
using FloatAB = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC = typename get_datatype_from_enum<CDataTypeEnum>::type;
constexpr index_t BlockSize = CK_PARAM_BlockSize;
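// Assumed host-side invocation (illustration, values are hypothetical): MIOpen's
// online compilation is expected to pass these parameters as -D options when
// building this kernel, e.g.
//   -DCK_PARAM_ABDataTypeEnum=1 -DCK_PARAM_AccDataTypeEnum=1
//   -DCK_PARAM_CDataTypeEnum=1 -DCK_PARAM_BlockSize=256 ...
// where 1 corresponds to DataTypeEnum_t::Float in data_type_enum.hpp.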
......@@ -61,7 +64,8 @@ constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDs
constexpr bool HasMainKBlockLoop = static_cast<bool>(CK_PARAM_HAS_MAIN_KBLOCK_LOOP);
constexpr bool HasDoubleTailKBlockLoop = static_cast<bool>(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP);
extern "C" __global__ void dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_prepare(
extern "C" __global__ void
dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare(
int n,
int c,
int hi,
......@@ -147,48 +151,48 @@ extern "C" __global__ void dynamic_convolution_forward_implicit_gemm_v4r4_nchw_k
using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using GridwiseGemm =
GridwiseDynamicGemm_km_kn_mn_v1r2<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperation::Set, /* ToDo tunable */
AKMGridDesc,
BKNGridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
M1PerThread,
N1PerThread,
KPerThread,
M1N1ThreadClusterM10,
M1N1ThreadClusterN10,
M1N1ThreadClusterM11,
M1N1ThreadClusterN11,
ABlockTransferThreadSliceLengths_K_M0_M1,
ABlockTransferThreadClusterLengths_K_M0_M1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_M1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K_N0_N1,
BBlockTransferThreadClusterLengths_K_N0_N1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridIteratorHacks,
BGridIteratorHacks,
CGridIteratorHacks,
AGridMoveSliceWindowIteratorHacks,
BGridMoveSliceWindowIteratorHacks>;
GridwiseDynamicGemmDlops_km_kn_mn_v1r2<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set, /* ToDo tunable */
AKMGridDesc,
BKNGridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
M1PerThread,
N1PerThread,
KPerThread,
M1N1ThreadClusterM10,
M1N1ThreadClusterN10,
M1N1ThreadClusterM11,
M1N1ThreadClusterN11,
ABlockTransferThreadSliceLengths_K_M0_M1,
ABlockTransferThreadClusterLengths_K_M0_M1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_M1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K_N0_N1,
BBlockTransferThreadClusterLengths_K_N0_N1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridIteratorHacks,
BGridIteratorHacks,
CGridIteratorHacks,
AGridMoveSliceWindowIteratorHacks,
BGridMoveSliceWindowIteratorHacks>;
auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
auto b_k_n0_n1_grid_desc = GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc);
......@@ -212,14 +216,14 @@ extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void __CONSTANT__* p_a_k_m0_m1_grid_desc,
const void __CONSTANT__* p_b_k_n0_n1_grid_desc,
const void __CONSTANT__* p_c_m0_m10_m11_n0_n10_n11_grid_desc,
const void __CONSTANT__* p_c_blockid_to_m0_n0_block_cluster_adaptor)
const void CONSTANT* p_a_k_m0_m1_grid_desc,
const void CONSTANT* p_b_k_n0_n1_grid_desc,
const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc,
const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -283,48 +287,48 @@ extern "C" __global__ void
using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using GridwiseGemm =
GridwiseDynamicGemm_km_kn_mn_v1r2<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperation::Set, /* ToDo tunable */
AKMGridDesc,
BKNGridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
M1PerThread,
N1PerThread,
KPerThread,
M1N1ThreadClusterM10,
M1N1ThreadClusterN10,
M1N1ThreadClusterM11,
M1N1ThreadClusterN11,
ABlockTransferThreadSliceLengths_K_M0_M1,
ABlockTransferThreadClusterLengths_K_M0_M1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_M1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K_N0_N1,
BBlockTransferThreadClusterLengths_K_N0_N1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridIteratorHacks,
BGridIteratorHacks,
CGridIteratorHacks,
AGridMoveSliceWindowIteratorHacks,
BGridMoveSliceWindowIteratorHacks>;
GridwiseDynamicGemmDlops_km_kn_mn_v1r2<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set, /* ToDo tunable */
AKMGridDesc,
BKNGridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
M1PerThread,
N1PerThread,
KPerThread,
M1N1ThreadClusterM10,
M1N1ThreadClusterN10,
M1N1ThreadClusterM11,
M1N1ThreadClusterN11,
ABlockTransferThreadSliceLengths_K_M0_M1,
ABlockTransferThreadClusterLengths_K_M0_M1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_M1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K_N0_N1,
BBlockTransferThreadClusterLengths_K_N0_N1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridIteratorHacks,
BGridIteratorHacks,
CGridIteratorHacks,
AGridMoveSliceWindowIteratorHacks,
BGridMoveSliceWindowIteratorHacks>;
constexpr auto a_k_m0_m1_grid_desc_tmp =
GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
......