Commit 19bb36d7 authored by wangshaojie6

merge dev branch

parents 3e7f7997 2e6eaf6e
...@@ -11,9 +11,6 @@ ...@@ -11,9 +11,6 @@
#include "blockwise_tensor_slice_transfer_v6r1.hpp" #include "blockwise_tensor_slice_transfer_v6r1.hpp"
#include "threadwise_tensor_slice_transfer.hpp" #include "threadwise_tensor_slice_transfer.hpp"
#define A_BLOCK_BANK_CONFLICT_FREE_WRW 1
#define B_BLOCK_BANK_CONFLICT_FREE_WRW 1
namespace ck { namespace ck {
template <typename GridwiseGemm, template <typename GridwiseGemm,
...@@ -43,6 +40,7 @@ __global__ void ...@@ -43,6 +40,7 @@ __global__ void
const CElementwiseOperation c_element_op, const CElementwiseOperation c_element_op,
const CBlockClusterAdaptor c_block_cluster_adaptor) const CBlockClusterAdaptor c_block_cluster_adaptor)
{ {
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
constexpr index_t shared_block_size = constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
...@@ -59,6 +57,18 @@ __global__ void ...@@ -59,6 +57,18 @@ __global__ void
b_element_op, b_element_op,
c_element_op, c_element_op,
c_block_cluster_adaptor); c_block_cluster_adaptor);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
ignore = a_b_k0_m_k1_grid_desc;
ignore = b_b_k0_n_k1_grid_desc;
ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
ignore = c_block_cluster_adaptor;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
} }
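The guard added above compiles the kernel body only for gfx908/gfx90a device passes (or for the host pass), and the "ignore = ..." assignments in the #else branch keep every kernel parameter referenced so the empty stub does not raise unused-parameter warnings. A minimal sketch of how such a helper can be defined, assuming ck::ignore (from the newly included ignore.hpp) behaves like std::ignore:

namespace ck {
namespace detail {
// Assigning any value to this object is a no-op; it only marks a parameter as "used".
struct ignore_t
{
    template <typename T>
    __host__ __device__ constexpr const ignore_t& operator=(const T&) const
    {
        return *this;
    }
};
} // namespace detail
inline constexpr detail::ignore_t ignore{};
} // namespace ck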
template <index_t BlockSize, template <index_t BlockSize,
...@@ -99,7 +109,9 @@ template <index_t BlockSize, ...@@ -99,7 +109,9 @@ template <index_t BlockSize,
index_t CShuffleMRepeatPerShuffle, index_t CShuffleMRepeatPerShuffle,
index_t CShuffleNRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle,
index_t CBlockTransferScalarPerVector_NWaveNPerXDL, index_t CBlockTransferScalarPerVector_NWaveNPerXDL,
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock> typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
bool ABlockLdsExtraM1Wrw = false,
bool BBlockLdsExtraN1Wrw = false>
struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
{ {
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
...@@ -111,20 +123,22 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -111,20 +123,22 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
static constexpr auto I6 = Number<6>{}; static constexpr auto I6 = Number<6>{};
static constexpr auto I7 = Number<7>{}; static constexpr auto I7 = Number<7>{};
// Bytes per 32 lds bank: 32 * 4 bytes
static constexpr auto BankLength = Number<128>{};
static constexpr auto ElePerBank = Number<BankLength / sizeof(FloatAB)>{};
// K1 should be Number<...> // K1 should be Number<...>
static constexpr auto K1 = Number<K1Value>{}; static constexpr auto K1 = Number<K1Value>{};
// M1 & N1 // M1 & M0
static constexpr auto ElePerBank = Number<64>{};
static constexpr auto M1PerBlock = Number<ElePerBank / K1Value>{}; static constexpr auto M1PerBlock = Number<ElePerBank / K1Value>{};
static constexpr auto N1PerBlock = Number<ElePerBank / K1Value>{};
// M0 & N0
static constexpr auto M0PerBlock = Number<MPerBlock / M1PerBlock>{}; static constexpr auto M0PerBlock = Number<MPerBlock / M1PerBlock>{};
static constexpr auto N0PerBlock = Number<NPerBlock / M1PerBlock>{}; static constexpr auto M1Padding = I4;
// M1 padding num // N1 & N0
static constexpr auto M1Padding = Number<4>{}; static constexpr auto N1PerBlock = Number<ElePerBank / K1Value>{};
static constexpr auto N1Padding = M1Padding; static constexpr auto N0PerBlock = Number<NPerBlock / M1PerBlock>{};
static constexpr auto N1Padding = I4;
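For reference, a worked example of the new constants, assuming a 2-byte FloatAB (e.g. fp16), K1 = 8 and MPerBlock = 128; the numbers are illustrative only:

// Padding arithmetic for the bank-conflict-free wrw LDS layout (illustrative values).
constexpr int BankLengthBytes = 128;                         // 32 LDS banks * 4 bytes
constexpr int ElePerBank      = BankLengthBytes / 2;         // 64 elements of a 2-byte type
constexpr int K1              = 8;
constexpr int M1PerBlock      = ElePerBank / K1;             // 8
constexpr int MPerBlock       = 128;
constexpr int M0PerBlock      = MPerBlock / M1PerBlock;      // 16
constexpr int M1Padding       = 4;
constexpr int PaddedRowStride = M1PerBlock * K1 + M1Padding; // 68 elements instead of 64
static_assert(PaddedRowStride == 68, "consecutive M0 rows no longer start on the same bank");

Because 68 elements is not a multiple of the 64-element bank row, threads writing consecutive M0 rows land on different banks during the weight-gradient (wrw) store.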
__host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1()
{ {
...@@ -134,26 +148,33 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -134,26 +148,33 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
constexpr auto a_block_desc_k0_m_k1 = [&]() { constexpr auto a_block_desc_k0_m_k1 = [&]() {
if constexpr(ABlockLdsExtraM) if constexpr(ABlockLdsExtraM)
{ {
#if A_BLOCK_BANK_CONFLICT_FREE_WRW if constexpr(ABlockLdsExtraM1Wrw)
{
constexpr auto a_block_desc_k0_m0_m1_k1 = make_naive_tensor_descriptor( constexpr auto a_block_desc_k0_m0_m1_k1 = make_naive_tensor_descriptor(
make_tuple(Number<K0PerBlock>{}, Number<M0PerBlock>{}, Number<M1PerBlock>{}, K1), make_tuple(
make_tuple(Number<M0PerBlock>{} * (Number<M1PerBlock>{} * K1 + M1Padding), Number<M1PerBlock>{} * K1 + M1Padding, K1, I1)); Number<K0PerBlock>{}, Number<M0PerBlock>{}, Number<M1PerBlock>{}, K1),
make_tuple(Number<M0PerBlock>{} * (Number<M1PerBlock>{} * K1 + M1Padding),
Number<M1PerBlock>{} * K1 + M1Padding,
K1,
I1));
constexpr auto a_block_desc_k0_m_k1_tmp = transform_tensor_descriptor( constexpr auto a_block_desc_k0_m_k1_tmp = transform_tensor_descriptor(
a_block_desc_k0_m0_m1_k1, a_block_desc_k0_m0_m1_k1,
make_tuple(make_pass_through_transform(Number<K0PerBlock>{}), make_tuple(make_pass_through_transform(Number<K0PerBlock>{}),
make_merge_transform_v3_division_mod(make_tuple(Number<M0PerBlock>{}, Number<M1PerBlock>{})), make_merge_transform_v3_division_mod(
make_tuple(Number<M0PerBlock>{}, Number<M1PerBlock>{})),
make_pass_through_transform(K1)), make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}) make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
);
return a_block_desc_k0_m_k1_tmp; return a_block_desc_k0_m_k1_tmp;
#else }
else
{
return make_naive_tensor_descriptor( return make_naive_tensor_descriptor(
make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1), make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1),
make_tuple(Number<MPerBlock + 1>{} * K1, K1, I1)); make_tuple(Number<MPerBlock + 1>{} * K1, K1, I1));
#endif }
} }
else else
{ {
...@@ -173,32 +194,48 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -173,32 +194,48 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
constexpr auto a_block_desc_b_k0_m_k1 = [&]() { constexpr auto a_block_desc_b_k0_m_k1 = [&]() {
if constexpr(ABlockLdsExtraM) if constexpr(ABlockLdsExtraM)
{ {
#if A_BLOCK_BANK_CONFLICT_FREE_WRW if constexpr(ABlockLdsExtraM1Wrw)
{
constexpr auto a_block_desc_b_k0_m0_m1_k1 = make_naive_tensor_descriptor( constexpr auto a_block_desc_b_k0_m0_m1_k1 = make_naive_tensor_descriptor(
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<M0PerBlock>{}, Number<M1PerBlock>{}, K1), make_tuple(Number<1>{},
make_tuple(Number<K0PerBlock>{} * Number<M0PerBlock>{} * (Number<M1PerBlock>{} * K1 + M1Padding), Number<M0PerBlock>{} * (Number<M1PerBlock>{} * K1 + M1Padding), Number<M1PerBlock>{} * K1 + M1Padding, K1, I1)); Number<K0PerBlock>{},
Number<M0PerBlock>{},
Number<M1PerBlock>{},
K1),
make_tuple(Number<K0PerBlock>{} * Number<M0PerBlock>{} *
(Number<M1PerBlock>{} * K1 + M1Padding),
Number<M0PerBlock>{} * (Number<M1PerBlock>{} * K1 + M1Padding),
Number<M1PerBlock>{} * K1 + M1Padding,
K1,
I1));
constexpr auto a_block_desc_b_k0_m_k1_tmp = transform_tensor_descriptor( constexpr auto a_block_desc_b_k0_m_k1_tmp = transform_tensor_descriptor(
a_block_desc_b_k0_m0_m1_k1, a_block_desc_b_k0_m0_m1_k1,
make_tuple(make_pass_through_transform(Number<1>{}), make_tuple(make_pass_through_transform(Number<1>{}),
make_pass_through_transform(Number<K0PerBlock>{}), make_pass_through_transform(Number<K0PerBlock>{}),
make_merge_transform_v3_division_mod_for_wrw(make_tuple(Number<M0PerBlock>{}, Number<M1PerBlock>{})), make_merge_transform_v3_division_mod_for_wrw(
make_tuple(Number<M0PerBlock>{}, Number<M1PerBlock>{})),
make_pass_through_transform(K1)), make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}) make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
);
return a_block_desc_b_k0_m_k1_tmp; return a_block_desc_b_k0_m_k1_tmp;
#else }
else
{
return make_naive_tensor_descriptor( return make_naive_tensor_descriptor(
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<MPerBlock>{}, K1), make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<MPerBlock>{}, K1),
make_tuple(Number<K0PerBlock>{} * Number<MPerBlock + 1>{} * K1, Number<MPerBlock + 1>{} * K1, K1, I1)); make_tuple(Number<K0PerBlock>{} * Number<MPerBlock + 1>{} * K1,
#endif Number<MPerBlock + 1>{} * K1,
K1,
I1));
}
} }
else else
{ {
return make_naive_tensor_descriptor_aligned( return make_naive_tensor_descriptor_aligned(
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align); make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<MPerBlock>{}, K1),
max_lds_align);
} }
}(); }();
...@@ -213,27 +250,33 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -213,27 +250,33 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
constexpr auto b_block_desc_k0_n_k1 = [&]() { constexpr auto b_block_desc_k0_n_k1 = [&]() {
if constexpr(BBlockLdsExtraN) if constexpr(BBlockLdsExtraN)
{ {
#if B_BLOCK_BANK_CONFLICT_FREE_WRW if constexpr(BBlockLdsExtraN1Wrw)
{
constexpr auto b_block_desc_k0_n0_n1_k1 = make_naive_tensor_descriptor( constexpr auto b_block_desc_k0_n0_n1_k1 = make_naive_tensor_descriptor(
make_tuple(Number<K0PerBlock>{}, Number<N0PerBlock>{}, Number<N1PerBlock>{}, K1), make_tuple(
make_tuple(Number<N0PerBlock>{} * (Number<N1PerBlock>{} * K1 + N1Padding), Number<N1PerBlock>{} * K1 + N1Padding, K1, I1)); Number<K0PerBlock>{}, Number<N0PerBlock>{}, Number<N1PerBlock>{}, K1),
make_tuple(Number<N0PerBlock>{} * (Number<N1PerBlock>{} * K1 + N1Padding),
Number<N1PerBlock>{} * K1 + N1Padding,
K1,
I1));
constexpr auto b_block_desc_k0_n_k1_tmp = transform_tensor_descriptor( constexpr auto b_block_desc_k0_n_k1_tmp = transform_tensor_descriptor(
b_block_desc_k0_n0_n1_k1, b_block_desc_k0_n0_n1_k1,
make_tuple(make_pass_through_transform(Number<K0PerBlock>{}), make_tuple(make_pass_through_transform(Number<K0PerBlock>{}),
make_merge_transform_v3_division_mod(make_tuple(Number<N0PerBlock>{}, Number<N1PerBlock>{})), make_merge_transform_v3_division_mod(
make_tuple(Number<N0PerBlock>{}, Number<N1PerBlock>{})),
make_pass_through_transform(K1)), make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}) make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
);
return b_block_desc_k0_n_k1_tmp; return b_block_desc_k0_n_k1_tmp;
#else }
else
{
return make_naive_tensor_descriptor( return make_naive_tensor_descriptor(
make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1), make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1),
make_tuple(Number<NPerBlock + 1>{} * K1, K1, I1)); make_tuple(Number<NPerBlock + 1>{} * K1, K1, I1));
#endif }
} }
else else
{ {
...@@ -253,32 +296,48 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -253,32 +296,48 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
constexpr auto b_block_desc_b_k0_n_k1 = [&]() { constexpr auto b_block_desc_b_k0_n_k1 = [&]() {
if constexpr(BBlockLdsExtraN) if constexpr(BBlockLdsExtraN)
{ {
#if B_BLOCK_BANK_CONFLICT_FREE_WRW if constexpr(BBlockLdsExtraN1Wrw)
{
constexpr auto b_block_desc_b_k0_n0_n1_k1 = make_naive_tensor_descriptor( constexpr auto b_block_desc_b_k0_n0_n1_k1 = make_naive_tensor_descriptor(
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<N0PerBlock>{}, Number<N1PerBlock>{}, K1), make_tuple(Number<1>{},
make_tuple(Number<K0PerBlock>{} * Number<N0PerBlock>{} * (Number<N1PerBlock>{} * K1 + N1Padding), Number<N0PerBlock>{} * (Number<N1PerBlock>{} * K1 + N1Padding), Number<N1PerBlock>{} * K1 + N1Padding, K1, I1)); Number<K0PerBlock>{},
Number<N0PerBlock>{},
Number<N1PerBlock>{},
K1),
make_tuple(Number<K0PerBlock>{} * Number<N0PerBlock>{} *
(Number<N1PerBlock>{} * K1 + N1Padding),
Number<N0PerBlock>{} * (Number<N1PerBlock>{} * K1 + N1Padding),
Number<N1PerBlock>{} * K1 + N1Padding,
K1,
I1));
constexpr auto b_block_desc_b_k0_n_k1_tmp = transform_tensor_descriptor( constexpr auto b_block_desc_b_k0_n_k1_tmp = transform_tensor_descriptor(
b_block_desc_b_k0_n0_n1_k1, b_block_desc_b_k0_n0_n1_k1,
make_tuple(make_pass_through_transform(Number<1>{}), make_tuple(make_pass_through_transform(Number<1>{}),
make_pass_through_transform(Number<K0PerBlock>{}), make_pass_through_transform(Number<K0PerBlock>{}),
make_merge_transform_v3_division_mod_for_wrw(make_tuple(Number<N0PerBlock>{}, Number<N1PerBlock>{})), make_merge_transform_v3_division_mod_for_wrw(
make_tuple(Number<N0PerBlock>{}, Number<N1PerBlock>{})),
make_pass_through_transform(K1)), make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}) make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
);
return b_block_desc_b_k0_n_k1_tmp; return b_block_desc_b_k0_n_k1_tmp;
#else }
else
{
return make_naive_tensor_descriptor( return make_naive_tensor_descriptor(
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<NPerBlock>{}, K1), make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<NPerBlock>{}, K1),
make_tuple(Number<K0PerBlock>{} * Number<NPerBlock + 1>{} * K1, Number<NPerBlock + 1>{} * K1, K1, I1)); make_tuple(Number<K0PerBlock>{} * Number<NPerBlock + 1>{} * K1,
#endif Number<NPerBlock + 1>{} * K1,
K1,
I1));
}
} }
else else
{ {
return make_naive_tensor_descriptor_aligned( return make_naive_tensor_descriptor_aligned(
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align); make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<NPerBlock>{}, K1),
max_lds_align);
} }
}(); }();
...@@ -296,11 +355,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -296,11 +355,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
constexpr auto b_b_k0_n_k1_block_desc = GetBBlockDescriptor_Batch_K0PerBlock_NPerBlock_K1(); constexpr auto b_b_k0_n_k1_block_desc = GetBBlockDescriptor_Batch_K0PerBlock_NPerBlock_K1();
// LDS allocation for A and B: be careful of alignment // LDS allocation for A and B: be careful of alignment
constexpr auto a_block_space_size = constexpr auto a_block_space_size = math::integer_least_multiple(
math::integer_least_multiple(a_b_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); a_b_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_space_size = constexpr auto b_block_space_size = math::integer_least_multiple(
math::integer_least_multiple(b_b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align); b_b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align);
constexpr auto c_block_size = constexpr auto c_block_size =
GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock().GetElementSpaceSize(); GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock().GetElementSpaceSize();
...@@ -557,8 +616,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -557,8 +616,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
// register // register
// sanity check // sanity check
constexpr index_t KPack = math::max( constexpr index_t KPack =
K1, MfmaSelector<FloatAB, MPerXDL, NPerXDL>::selected_mfma.k_per_blk); math::max(K1, MfmaSelector<FloatAB, MPerXDL, NPerXDL>::selected_mfma.k_per_blk);
auto blockwise_gemm = auto blockwise_gemm =
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize, BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
......
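The descriptor getters above replace the removed A_BLOCK_BANK_CONFLICT_FREE_WRW / B_BLOCK_BANK_CONFLICT_FREE_WRW macros with the new ABlockLdsExtraM1Wrw / BBlockLdsExtraN1Wrw template parameters, so both LDS layouts stay compiled and each GridwiseGemm instantiation picks one with if constexpr. A minimal, generic sketch of that pattern (illustrative code, not the CK descriptors themselves):

// The flag is a template parameter rather than a preprocessor switch, so both branches
// are type-checked and the choice is made per instantiation, not per translation unit.
template <bool UseWrwPaddedLayout>
constexpr int lds_row_stride(int m1_per_block, int k1, int padding)
{
    if constexpr(UseWrwPaddedLayout)
        return m1_per_block * k1 + padding; // bank-conflict-free wrw layout
    else
        return m1_per_block * k1;           // plain layout
}
static_assert(lds_row_stride<true>(8, 8, 4) == 68, "padded stride");
static_assert(lds_row_stride<false>(8, 8, 4) == 64, "unpadded stride");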
...@@ -42,6 +42,7 @@ __global__ void ...@@ -42,6 +42,7 @@ __global__ void
const CElementwiseOperation c_element_op, const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map) const Block2CTileMap block_2_ctile_map)
{ {
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainK0BlockLoop>( GridwiseGemm::template Run<HasMainK0BlockLoop>(
...@@ -56,6 +57,18 @@ __global__ void ...@@ -56,6 +57,18 @@ __global__ void
b_element_op, b_element_op,
c_element_op, c_element_op,
block_2_ctile_map); block_2_ctile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
ignore = a_grid_desc_ak0_m_ak1;
ignore = b_grid_desc_bk0_n_bk1;
ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
ignore = block_2_ctile_map;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
} }
template < template <
......
...@@ -45,6 +45,7 @@ __global__ void ...@@ -45,6 +45,7 @@ __global__ void
const CElementwiseOperation c_element_op, const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map) const Block2CTileMap block_2_ctile_map)
{ {
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainK0BlockLoop>( GridwiseGemm::template Run<HasMainK0BlockLoop>(
...@@ -61,6 +62,20 @@ __global__ void ...@@ -61,6 +62,20 @@ __global__ void
b_element_op, b_element_op,
c_element_op, c_element_op,
block_2_ctile_map); block_2_ctile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
ignore = p_c0_grid;
ignore = a_grid_desc_k0_m_k1;
ignore = b_grid_desc_k0_n_k1;
ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
ignore = block_2_ctile_map;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
} }
template < template <
......
...@@ -49,6 +49,7 @@ __global__ void ...@@ -49,6 +49,7 @@ __global__ void
const CElementwiseOperation c_element_op, const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map) const Block2CTileMap block_2_ctile_map)
{ {
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainK0BlockLoop>( GridwiseGemm::template Run<HasMainK0BlockLoop>(
...@@ -67,6 +68,22 @@ __global__ void ...@@ -67,6 +68,22 @@ __global__ void
b_element_op, b_element_op,
c_element_op, c_element_op,
block_2_ctile_map); block_2_ctile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
ignore = p_c0_grid;
ignore = p_c1_grid;
ignore = a_grid_desc_k0_m_k1;
ignore = b_grid_desc_k0_n_k1;
ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
ignore = block_2_ctile_map;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
} }
template < template <
......
...@@ -36,6 +36,7 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe ...@@ -36,6 +36,7 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe
DataType value) DataType value)
{ {
using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<DataType, DataType>; using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<DataType, DataType>;
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
......
...@@ -277,9 +277,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1 ...@@ -277,9 +277,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1
// sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_ // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
// TODO make this logic more generic for more sub-dword datatype // TODO make this logic more generic for more sub-dword datatype
if constexpr(SrcVectorDim != DstVectorDim && if constexpr(SrcVectorDim != DstVectorDim &&
is_same<half_t, remove_cvref_t<SrcData>>::value && ((is_same<half_t, remove_cvref_t<SrcData>>::value &&
is_same<half_t, remove_cvref_t<DstData>>::value && is_same<half_t, remove_cvref_t<DstData>>::value &&
SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) ||
(is_same<int8_t, remove_cvref_t<SrcData>>::value &&
is_same<int8_t, remove_cvref_t<DstData>>::value &&
SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
{ {
// each transpose does // each transpose does
// DstScalarPerVector # of src vectors in src_thread_scratch_ // DstScalarPerVector # of src vectors in src_thread_scratch_
......
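The widened condition above enables the sub-dword transpose for int8 as well as fp16; a standalone restatement of the eligibility rule as a constexpr helper (the name is illustrative, not part of the CK API):

#include <cstddef>
#include <cstdint>

// The transpose operates on whole 32-bit dwords, so the per-vector scalar counts must
// be multiples of the per-dword packing factor: 2 for 2-byte half_t, 4 for 1-byte int8_t.
template <typename Data>
constexpr bool can_use_subdword_transpose(std::size_t src_scalars, std::size_t dst_scalars)
{
    constexpr std::size_t pack = 4 / sizeof(Data);
    return sizeof(Data) < 4 && src_scalars % pack == 0 && dst_scalars % pack == 0;
}

static_assert(can_use_subdword_transpose<std::int8_t>(16, 4), "int8 needs multiples of 4");
static_assert(!can_use_subdword_transpose<std::int8_t>(16, 2), "2 int8 scalars do not fill a dword");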
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "functional3.hpp" #include "functional3.hpp"
#include "functional4.hpp" #include "functional4.hpp"
#include "enable_if.hpp" #include "enable_if.hpp"
#include "ignore.hpp"
#include "integral_constant.hpp" #include "integral_constant.hpp"
#include "math.hpp" #include "math.hpp"
#include "number.hpp" #include "number.hpp"
...@@ -30,6 +31,7 @@ ...@@ -30,6 +31,7 @@
#include "debug.hpp" #include "debug.hpp"
#include "amd_buffer_addressing.hpp" #include "amd_buffer_addressing.hpp"
#include "generic_memory_space_atomic_add.hpp"
#include "get_id.hpp" #include "get_id.hpp"
#include "synchronization.hpp" #include "synchronization.hpp"
#include "amd_address_space.hpp" #include "amd_address_space.hpp"
......
...@@ -992,77 +992,6 @@ inline __host__ __device__ bhalf_t type_convert<bhalf_t, float>(float x) ...@@ -992,77 +992,6 @@ inline __host__ __device__ bhalf_t type_convert<bhalf_t, float>(float x)
return uint16_t(u.int32 >> 16); return uint16_t(u.int32 >> 16);
} }
// TODO: deprecate this
template <typename T>
struct inner_product_with_conversion
{
template <typename X, index_t N>
__device__ T operator()(typename vector_type<X, N>::type a,
typename vector_type<X, N>::type b) const
{
const vector_type<X, N> a_vector{a};
const vector_type<X, N> b_vector{b};
T acc = 0;
static_for<0, N, 1>{}([&](auto i) {
acc += type_convert<T>(a_vector.Scalars()[i]) * type_convert<T>(b_vector.Scalars()[i]);
});
return acc;
}
__device__ T operator()(float_t a, float_t b) const
{
return type_convert<T>(a) * type_convert<T>(b);
}
__device__ T operator()(int8x4_t a, int8x4_t b) const
{
const vector_type<int8_t, 4> a_vector{a};
const vector_type<int8_t, 4> b_vector{b};
T acc = 0;
static_for<0, 4, 1>{}([&](auto i) {
acc += type_convert<T>(a_vector.AsType<int8_t>()[i]) *
type_convert<T>(b_vector.AsType<int8_t>()[i]);
});
return acc;
}
__device__ T operator()(int8x8_t a, int8x8_t b) const
{
const vector_type<int8_t, 8> a_vector{a};
const vector_type<int8_t, 8> b_vector{b};
T acc = 0;
static_for<0, 8, 1>{}([&](auto i) {
acc += type_convert<T>(a_vector.AsType<int8_t>()[i]) *
type_convert<T>(b_vector.AsType<int8_t>()[i]);
});
return acc;
}
__device__ T operator()(int8x16_t a, int8x16_t b) const
{
const vector_type<int8_t, 16> a_vector{a};
const vector_type<int8_t, 16> b_vector{b};
T acc = 0;
static_for<0, 16, 1>{}([&](auto i) {
acc += type_convert<T>(a_vector.AsType<int8_t>()[i]) *
type_convert<T>(b_vector.AsType<int8_t>()[i]);
});
return acc;
}
};
template <typename T> template <typename T>
struct NumericLimits struct NumericLimits
{ {
......
#pragma once #pragma once
#include "amd_buffer_addressing.hpp"
#include "c_style_pointer_cast.hpp"
#include "config.hpp" #include "config.hpp"
#include "enable_if.hpp" #include "enable_if.hpp"
#include "c_style_pointer_cast.hpp"
#include "amd_buffer_addressing.hpp"
#include "generic_memory_space_atomic_add.hpp"
namespace ck { namespace ck {
// T may be scalar or vector
// X may be scalar or vector
// T and X have same scalar type
// X contains multiple T
template <AddressSpaceEnum BufferAddressSpace, template <AddressSpaceEnum BufferAddressSpace,
typename T, typename T,
typename ElementSpaceSize, typename ElementSpaceSize,
...@@ -266,9 +271,6 @@ struct DynamicBuffer ...@@ -266,9 +271,6 @@ struct DynamicBuffer
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X)); __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#else #else
//if(get_block_1d_id() == 0){
// printf("%d, tid=%d, i=%d\n", __LINE__, get_thread_local_1d_id(), i);
//}
*c_style_pointer_cast<X*>(&p_data_[i]) = x; *c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif #endif
} }
...@@ -319,9 +321,7 @@ struct DynamicBuffer ...@@ -319,9 +321,7 @@ struct DynamicBuffer
{ {
if(is_valid_element) if(is_valid_element)
{ {
// FIXME: atomicAdd is defined by HIP, need to avoid implicit type casting when atomic_add<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
// calling it
atomicAdd(c_style_pointer_cast<X*>(&p_data_[i]), x);
} }
} }
} }
......
#pragma once
#include "data_type.hpp"
namespace ck {
template <typename X>
__device__ X atomic_add(X* p_dst, const X& x);
template <>
__device__ int32_t atomic_add<int32_t>(int32_t* p_dst, const int32_t& x)
{
return atomicAdd(p_dst, x);
}
template <>
__device__ uint32_t atomic_add<uint32_t>(uint32_t* p_dst, const uint32_t& x)
{
return atomicAdd(p_dst, x);
}
template <>
__device__ float atomic_add<float>(float* p_dst, const float& x)
{
return atomicAdd(p_dst, x);
}
template <>
__device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
const vector_type<float, 2> vx{x};
vector_type<float, 2> vy{0};
vy.template AsType<float>()(I0) =
atomicAdd(c_style_pointer_cast<float*>(p_dst), vx.template AsType<float>()[I0]);
vy.template AsType<float>()(I1) =
atomicAdd(c_style_pointer_cast<float*>(p_dst) + 1, vx.template AsType<float>()[I1]);
return vy.template AsType<float2_t>()[I0];
}
} // namespace ck
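A small usage sketch for the typed wrapper above, replacing the direct atomicAdd call that DynamicBuffer made before; the kernel and indexing are illustrative only:

// Each thread accumulates one float2_t (a pair of floats) into the destination buffer.
__global__ void accumulate_pairs(float* p_dst, const float* p_src)
{
    using namespace ck;
    const int i      = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    const float2_t v = *c_style_pointer_cast<const float2_t*>(&p_src[i]);
    atomic_add<float2_t>(c_style_pointer_cast<float2_t*>(&p_dst[i]), v);
}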
#ifndef CK_MATH_V2_HPP #ifndef CK_MATH_V2_HPP
#define CK_MATH_V2_HPP #define CK_MATH_V2_HPP
#include <cmath>
#include "data_type.hpp" #include "data_type.hpp"
#include "half.hpp"
namespace ck { namespace ck {
namespace math { namespace math {
static inline __device__ half_t abs(half_t x) { return __habs(x); }; static inline __host__ float abs(float x) { return std::abs(x); };
static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); };
static inline __device__ bool isnan(half_t x) { return __hisnan(x); }; static inline __host__ double abs(double x) { return std::abs(x); };
static inline __host__ int8_t abs(int8_t x)
{
int8_t sgn = x >> (8 - 1);
return (x ^ sgn) - sgn;
};
static inline __host__ int32_t abs(int32_t x)
{
int32_t sgn = x >> (32 - 1);
return (x ^ sgn) - sgn;
};
static inline __host__ half_t abs(half_t x)
{
half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
half_float::half abs_xx = half_float::abs(xx);
half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx);
return abs_x;
};
static inline __host__ float isnan(float x) { return std::isnan(x); };
static inline __host__ double isnan(double x) { return std::isnan(x); };
static inline __host__ int8_t isnan(int8_t x)
{
(void)x;
return false;
};
static inline __host__ int32_t isnan(int32_t x)
{
(void)x;
return false;
};
static inline __host__ bool isnan(half_t x)
{
half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
return half_float::isnan(xx);
};
} // namespace math } // namespace math
} // namespace ck } // namespace ck
......
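The branchless host-side abs added above builds a sign mask with an arithmetic right shift; a self-contained worked example of the same trick in standard C++ (values are illustrative):

#include <cassert>
#include <cstdint>

int main()
{
    auto abs8 = [](std::int8_t x) -> std::int8_t {
        std::int8_t sgn = x >> (8 - 1); // all ones for negative x, all zeros otherwise
        return (x ^ sgn) - sgn;         // for x = -5 (0xFB): sgn = 0xFF, x ^ sgn = 4, 4 - (-1) = 5
    };
    assert(abs8(-5) == 5 && abs8(7) == 7 && abs8(0) == 0);
    return 0;
}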
...@@ -33,7 +33,7 @@ namespace ck { ...@@ -33,7 +33,7 @@ namespace ck {
struct float_equal_one struct float_equal_one
{ {
template <class T> template <class T>
__device__ inline bool operator()(T x) __host__ __device__ inline bool operator()(T x)
{ {
return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f); return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f);
}; };
...@@ -42,7 +42,7 @@ struct float_equal_one ...@@ -42,7 +42,7 @@ struct float_equal_one
struct float_equal_zero struct float_equal_zero
{ {
template <class T> template <class T>
__device__ inline bool operator()(T x) __host__ __device__ inline bool operator()(T x)
{ {
return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f); return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f);
}; };
......
...@@ -49,7 +49,7 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t ...@@ -49,7 +49,7 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
template <index_t NX, index_t NY> template <index_t NX, index_t NY>
struct transpose_vectors<half_t, NX, NY> struct transpose_vectors<half_t, NX, NY>
{ {
// we got [NY * NX] ammount of S data to be transposed // we got [NY * NX] amount of S data to be transposed
static constexpr index_t s_per_x = NY; static constexpr index_t s_per_x = NY;
static constexpr index_t s_per_y = NX; static constexpr index_t s_per_y = NX;
...@@ -83,5 +83,86 @@ struct transpose_vectors<half_t, NX, NY> ...@@ -83,5 +83,86 @@ struct transpose_vectors<half_t, NX, NY>
} }
}; };
// transpose int8 4x4
__device__ void transpose_int8_4x4(const int8x4_t& x0,
const int8x4_t& x1,
const int8x4_t& x2,
const int8x4_t& x3,
int8x4_t& y0,
int8x4_t& y1,
int8x4_t& y2,
int8x4_t& y3)
{
int32_t t0, t1;
int32_t z0, z1, z2, z3;
constexpr int32_t m0 = 0x05010400;
constexpr int32_t m1 = 0x05040100;
constexpr int32_t m2 = 0x07060302;
constexpr int32_t m3 = 0x07030602;
// ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
// -- -- -- -- -- -- -- -- - - - -
// index 7 6 5 4 3 2 1 0 33 77 44 88
// index is reversed because of little endianness (least significant bits first)
// clang-format off
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m0));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m0));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z0) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z1) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m3));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m3));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z2) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z3) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
// clang-format on
y0 = bit_cast<int8x4_t>(z0);
y1 = bit_cast<int8x4_t>(z1);
y2 = bit_cast<int8x4_t>(z2);
y3 = bit_cast<int8x4_t>(z3);
}
template <index_t NX, index_t NY>
struct transpose_vectors<int8_t, NX, NY>
{
// we got [NY * NX] amount of S data to be transposed
static constexpr index_t s_per_x = NY;
static constexpr index_t s_per_y = NX;
using S = int8_t;
using VX = vector_type<int8_t, s_per_x>;
using VY = vector_type<int8_t, s_per_y>;
__device__ void operator()(const StaticallyIndexedArray<const VX&, NX>& vx_tuple,
StaticallyIndexedArray<VY&, NY>& vy_tuple)
{
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");
// loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
static_for<0, NY, 4>{}([&](auto iy) {
static_for<0, NX, 4>{}([&](auto ix) {
// reference to 4 int8 data from vx_tuple
const auto& x_s4_0 = vx_tuple[ix].template AsType<int8x4_t>()[iy / I4];
const auto& x_s4_1 = vx_tuple[ix + I1].template AsType<int8x4_t>()[iy / I4];
const auto& x_s4_2 = vx_tuple[ix + I2].template AsType<int8x4_t>()[iy / I4];
const auto& x_s4_3 = vx_tuple[ix + I3].template AsType<int8x4_t>()[iy / I4];
// reference to 4 int8 data from vy_tuple
auto& y_s4_0 = vy_tuple(iy).template AsType<int8x4_t>()(ix / I4);
auto& y_s4_1 = vy_tuple(iy + I1).template AsType<int8x4_t>()(ix / I4);
auto& y_s4_2 = vy_tuple(iy + I2).template AsType<int8x4_t>()(ix / I4);
auto& y_s4_3 = vy_tuple(iy + I3).template AsType<int8x4_t>()(ix / I4);
// transpose
transpose_int8_4x4(x_s4_0, x_s4_1, x_s4_2, x_s4_3, y_s4_0, y_s4_1, y_s4_2, y_s4_3);
});
});
}
};
} // namespace ck } // namespace ck
#endif #endif
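For reference, the inline-assembly routine above performs a 4x4 byte-matrix transpose across four int8x4_t rows; a host-side reference of the same operation (useful for reasoning about the v_perm_b32 byte-select masks, not meant for device use):

#include <cstdint>

// y[j][i] = x[i][j]: output row r gathers byte r from each of the four input rows,
// which is what the m0/m1/m2/m3 selectors assemble two bytes at a time on the GPU.
void transpose_int8_4x4_ref(const std::int8_t x[4][4], std::int8_t y[4][4])
{
    for(int i = 0; i < 4; ++i)
        for(int j = 0; j < 4; ++j)
            y[j][i] = x[i][j];
}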
add_subdirectory(src/host_tensor) add_subdirectory(src/host_tensor)
add_subdirectory(src/tensor_operation_instance/gpu) add_subdirectory(src/tensor_operation_instance/gpu)
add_subdirectory(src/utility)
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
#ifndef GUARD_HOST_REDUCE_UTIL_HPP #ifndef GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP #define GUARD_HOST_REDUCE_UTIL_HPP
#include <half.hpp>
#include <limits> #include <limits>
#include <cmath> #include <cmath>
#include <cassert> #include <cassert>
...@@ -34,6 +33,8 @@ ...@@ -34,6 +33,8 @@
#include <string> #include <string>
#include "reduction_enums.hpp" #include "reduction_enums.hpp"
#include "data_type.hpp"
#include "math_v2.hpp"
namespace ck { namespace ck {
...@@ -42,34 +43,10 @@ namespace host_reduce { ...@@ -42,34 +43,10 @@ namespace host_reduce {
using ck::NanPropagation; using ck::NanPropagation;
using ck::ReduceTensorOp; using ck::ReduceTensorOp;
template <typename T>
static inline bool float_equal_one(T);
static inline bool float_equal_one(float x) { return x == 1.0f; };
static inline bool float_equal_one(double x) { return x == 1.0; };
static inline bool float_equal_one(half_float::half x)
{
return x == static_cast<half_float::half>(1.0f);
};
template <typename T>
static inline bool float_equal_zero(T x);
static inline bool float_equal_zero(float x) { return x == 0.0f; };
static inline bool float_equal_zero(double x) { return x == 0.0; };
static inline bool float_equal_zero(half_float::half x)
{
return x == static_cast<half_float::half>(0.0f);
};
template <typename AccDataType, ReduceTensorOp ReduceOpId> template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int) __host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{ {
using std::abs; using ck::math::abs;
if constexpr(ReduceOpId == ReduceTensorOp::NORM1) if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
{ {
...@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal() ...@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal()
} }
else if constexpr(ReduceOpId == ReduceTensorOp::MIN) else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
{ {
return (std::numeric_limits<AccDataType>::max()); return (ck::NumericLimits<AccDataType>::Max());
} }
else if constexpr(ReduceOpId == ReduceTensorOp::MAX) else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
{ {
return (std::numeric_limits<AccDataType>::lowest()); return (ck::NumericLimits<AccDataType>::Lowest());
} }
else if constexpr(ReduceOpId == ReduceTensorOp::AMAX) else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
{ {
...@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce, ...@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
AccDataType& accuVal, AccDataType& accuVal,
AccDataType currVal) AccDataType currVal)
{ {
using std::isnan; using ck::math::isnan;
if constexpr(!PropagateNan) if constexpr(!PropagateNan)
{ {
...@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe ...@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
int& accuIndex, int& accuIndex,
int currIndex) int currIndex)
{ {
using std::isnan; using ck::math::isnan;
if constexpr(!PropagateNan) if constexpr(!PropagateNan)
{ {
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include <functional> #include <functional>
#include "reduction_enums.hpp" #include "reduction_enums.hpp"
#include "reduction_common.hpp"
#include "host_reduce_util.hpp" #include "host_reduce_util.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "data_type.hpp" #include "data_type.hpp"
...@@ -196,10 +197,10 @@ struct ReductionHost ...@@ -196,10 +197,10 @@ struct ReductionHost
OutDataType* out_data, OutDataType* out_data,
IndexDataType* out_indices) IndexDataType* out_indices)
{ {
using ck::float_equal_one;
using ck::float_equal_zero;
using ck::type_convert; using ck::type_convert;
using ck::host_reduce::binop_with_nan_check2; using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn2; using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal; using ck::host_reduce::ReduceOpZeroVal;
...@@ -227,10 +228,10 @@ struct ReductionHost ...@@ -227,10 +228,10 @@ struct ReductionHost
posUnaryOp(accuVal); posUnaryOp(accuVal);
if(!float_equal_one(alpha)) if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha); accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta)) if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta); accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal); out_data[0] = type_convert<OutDataType>(accuVal);
...@@ -263,13 +264,13 @@ struct ReductionHost ...@@ -263,13 +264,13 @@ struct ReductionHost
posUnaryOp(accuVal); posUnaryOp(accuVal);
if(!float_equal_one(alpha)) if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha); accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset = auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index); get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta)) if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) * accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta); type_convert<AccDataType>(beta);
...@@ -303,10 +304,10 @@ struct ReductionHost ...@@ -303,10 +304,10 @@ struct ReductionHost
void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data) void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{ {
using ck::float_equal_one;
using ck::float_equal_zero;
using ck::type_convert; using ck::type_convert;
using ck::host_reduce::binop_with_nan_check; using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn; using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal; using ck::host_reduce::ReduceOpZeroVal;
...@@ -330,10 +331,10 @@ struct ReductionHost ...@@ -330,10 +331,10 @@ struct ReductionHost
posUnaryOp(accuVal); posUnaryOp(accuVal);
if(!float_equal_one(alpha)) if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha); accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta)) if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta); accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal); out_data[0] = type_convert<OutDataType>(accuVal);
...@@ -361,13 +362,13 @@ struct ReductionHost ...@@ -361,13 +362,13 @@ struct ReductionHost
posUnaryOp(accuVal); posUnaryOp(accuVal);
if(!float_equal_one(alpha)) if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha); accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset = auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index); get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta)) if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) * accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta); type_convert<AccDataType>(beta);
......
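Call sites in ReductionHost now construct the shared functors from reduction_common.hpp instead of the removed host_reduce free functions; a short sketch of the new form (assumes the CK reduction headers are on the include path, values are illustrative):

#include "reduction_common.hpp"

void scale_and_blend(float alpha, float beta, float& accuVal, float outVal)
{
    using ck::float_equal_one;
    using ck::float_equal_zero;

    if(!float_equal_one{}(alpha)) // construct the functor, then call it
        accuVal *= alpha;
    if(!float_equal_zero{}(beta))
        accuVal += outVal * beta;
}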
#pragma once
#include <algorithm>
#include <random>
#include "data_type.hpp"
namespace ck {
namespace utils {
// template <typename T, class Enable = void>
// struct FillUniform;
// TODO: what's wrong with this specialization???
// err: segmentation fault in mt19937 - infinite loop like.
// template <typename T>
// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
// !std::is_same<T, bhalf_t>::value>::type>
// {
// int a_{0};
// int b_{5};
// // T a_ = T{0};
// // T b_ = T{5};
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen{11939};
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
// }
// };
// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
// std::is_same<T, bhalf_t>::value>::type>
template <typename T>
struct FillUniform
{
float a_{0};
float b_{5};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::mt19937 gen{11939};
std::uniform_real_distribution<> dis(a_, b_);
std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
}
};
template <typename T>
struct FillMonotonicSeq
{
T init_value_{0};
T step_{1};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::generate(first, last, [=, n = init_value_]() mutable {
auto tmp = n;
n += step_;
return tmp;
});
}
};
template <typename T>
struct FillConstant
{
T value_{0};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::fill(first, last, value_);
}
};
} // namespace utils
} // namespace ck
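A short host-side usage sketch for the fill helpers above (assumes fill.hpp is on the include path; sizes and bounds are illustrative):

#include <vector>
#include "fill.hpp"

void fill_example()
{
    std::vector<float> a(1024);
    ck::utils::FillUniform<float>{0.f, 5.f}(a.begin(), a.end()); // uniform values in [0, 5)

    std::vector<int> b(16);
    ck::utils::FillMonotonicSeq<int>{0, 2}(b.begin(), b.end());  // 0, 2, 4, ...

    std::vector<float> c(64);
    ck::utils::FillConstant<float>{1.f}(c.begin(), c.end());     // all ones
}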
#pragma once
#include <cstdlib>
#include <limits>
#include <memory>
#include <stdexcept>
#include <tuple>
#include <utility>
#include <vector>
#include "check_err.hpp"
#include "device_base.hpp"
#include "functional2.hpp"
namespace ck {
namespace utils {
struct ProfileBestConfig
{
std::string best_op_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_tflops = std::numeric_limits<float>::max();
float best_gb_per_sec = std::numeric_limits<float>::max();
};
/**
* @brief This class describes an operation instance(s).
*
* Op instance defines a particular specializations of operator
* template. Thanks to this specific input/output data types, data
* layouts and modifying elementwise operations it is able to create
* it's input/output tensors, provide pointers to instances which
* can execute it and all operation specific parameters.
*/
template <typename OutDataType, typename... InArgTypes>
class OpInstance
{
public:
template <typename T>
using TensorPtr = std::unique_ptr<Tensor<T>>;
using InTensorsTuple = std::tuple<TensorPtr<InArgTypes>...>;
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
using DeviceBuffers = std::vector<DeviceMemPtr>;
OpInstance() = default;
OpInstance(const OpInstance&) = default;
OpInstance& operator=(const OpInstance&) = default;
virtual ~OpInstance(){};
virtual InTensorsTuple GetInputTensors() const = 0;
virtual TensorPtr<OutDataType> GetOutputTensor() const = 0;
virtual std::unique_ptr<tensor_operation::device::BaseInvoker>
MakeInvokerPointer(tensor_operation::device::BaseOperator*) const = 0;
virtual std::unique_ptr<tensor_operation::device::BaseArgument>
MakeArgumentPointer(tensor_operation::device::BaseOperator*,
const DeviceBuffers&,
const DeviceMemPtr&) const = 0;
virtual std::size_t GetFlops() const = 0;
virtual std::size_t GetBtype() const = 0;
};
/**
* @brief A generic operation instance run engine.
*/
template <typename OutDataType, typename... InArgTypes>
class OpInstanceRunEngine
{
public:
using OpInstanceT = OpInstance<InArgTypes..., OutDataType>;
template <typename T>
using TensorPtr = std::unique_ptr<Tensor<T>>;
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
using InTensorsTuple = std::tuple<TensorPtr<InArgTypes>...>;
using DeviceBuffers = std::vector<DeviceMemPtr>;
using InArgsTypesTuple = std::tuple<InArgTypes...>;
OpInstanceRunEngine() = delete;
template <typename ReferenceOp = std::function<void()>>
OpInstanceRunEngine(const OpInstanceT& op_instance,
const ReferenceOp& reference_op = ReferenceOp{})
: op_instance_{op_instance}
{
in_tensors_ = op_instance_.GetInputTensors();
out_tensor_ = op_instance_.GetOutputTensor();
if constexpr(std::is_invocable_v<ReferenceOp,
const Tensor<InArgTypes>&...,
Tensor<OutDataType>&>)
{
ref_output_ = op_instance_.GetOutputTensor();
CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
}
AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
out_device_buffer_ =
std::make_unique<DeviceMem>(sizeof(OutDataType) * out_tensor_->mDesc.GetElementSpace());
out_device_buffer_->SetZero();
}
virtual ~OpInstanceRunEngine(){};
template <typename OpInstancePtr>
bool Test(const std::vector<OpInstancePtr>& op_ptrs)
{
bool res{true};
for(auto& op_ptr : op_ptrs)
{
auto invoker = op_instance_.MakeInvokerPointer(op_ptr.get());
auto argument = op_instance_.MakeArgumentPointer(
op_ptr.get(), in_device_buffers_, out_device_buffer_);
if(op_ptr->IsSupportedArgument(argument.get()))
{
invoker->Run(argument.get());
out_device_buffer_->FromDevice(out_tensor_->mData.data());
if(!ref_output_)
{
throw std::runtime_error(
"OpInstanceRunEngine::Test: Reference value not availabe."
" You have to provide reference function.");
}
// TODO: enable flexible use of custom check_error functions
res = res && check_err(out_tensor_->mData, ref_output_->mData);
out_device_buffer_->SetZero();
}
}
return res;
}
template <typename OpInstancePtr>
ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
int nrepeat = 100,
bool do_verification = false,
bool do_log = false)
{
bool res{true};
ProfileBestConfig best_config;
for(auto& op_ptr : op_ptrs)
{
auto invoker = op_instance_.MakeInvokerPointer(op_ptr.get());
auto argument = op_instance_.MakeArgumentPointer(
op_ptr.get(), in_device_buffers_, out_device_buffer_);
if(op_ptr->IsSupportedArgument(argument.get()))
{
std::string op_name = op_ptr->GetTypeString();
float avg_time = invoker->Run(argument.get(), nrepeat);
std::size_t flops = op_instance_.GetFlops();
std::size_t num_btype = op_instance_.GetBtype();
float tflops = static_cast<float>(flops) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << op_name << std::endl;
if(avg_time < best_config.best_avg_time) // keep the fastest (lowest-latency) instance
{
best_config.best_op_name = op_name;
best_config.best_tflops = tflops;
best_config.best_gb_per_sec = gb_per_sec;
best_config.best_avg_time = avg_time;
}
if(do_verification)
{
out_device_buffer_->FromDevice(out_tensor_->mData.data());
if(!ref_output_)
{
throw std::runtime_error(
"OpInstanceRunEngine::Profile: Reference value not availabe."
" You have to provide reference function.");
}
// TODO: enable flexible use of custom check_error functions
res = res && CheckErr(out_tensor_->mData, ref_output_->mData);
if(do_log) {}
}
out_device_buffer_->SetZero();
}
}
return best_config;
}
void SetAtol(double a) { atol_ = a; }
void SetRtol(double r) { rtol_ = r; }
private:
template <typename F, std::size_t... Is>
void CallRefOpUnpackArgs(const F& f, std::index_sequence<Is...>) const
{
f(*std::get<Is>(in_tensors_)..., *ref_output_);
}
template <std::size_t... Is>
void AllocateDeviceInputTensors(std::index_sequence<Is...>)
{
(AllocateDeviceInputTensorsImpl<Is>(), ...);
}
template <std::size_t Index>
void AllocateDeviceInputTensorsImpl()
{
const auto& ts = std::get<Index>(in_tensors_);
in_device_buffers_
.emplace_back(
std::make_unique<DeviceMem>(sizeof(std::tuple_element_t<Index, InArgsTypesTuple>) *
ts->mDesc.GetElementSpace()))
->ToDevice(ts->mData.data());
}
static constexpr std::size_t kNInArgs_ = std::tuple_size_v<InTensorsTuple>;
const OpInstanceT& op_instance_;
double rtol_{1e-5};
double atol_{1e-8};
InTensorsTuple in_tensors_;
TensorPtr<OutDataType> out_tensor_;
TensorPtr<OutDataType> ref_output_;
DeviceBuffers in_device_buffers_;
DeviceMemPtr out_device_buffer_;
template <typename T>
bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
{
return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
}
};
} // namespace utils
} // namespace ck
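An intended-usage sketch for the engine above; ConvOpInstance, reference_conv, the data types and op_ptrs are hypothetical stand-ins and are not defined in this commit:

// Build a concrete OpInstance, wrap it in the run engine, then test and profile every
// device instance returned by an instance factory (all names here are placeholders).
ConvOpInstance op_instance{/* tensor lengths, strides, ... */};
ck::utils::OpInstanceRunEngine<OutDataType, InDataType, WeiDataType> engine{op_instance,
                                                                            reference_conv};
bool all_pass = engine.Test(op_ptrs);
auto best     = engine.Profile(op_ptrs, /*nrepeat=*/100, /*do_verification=*/true);
std::cout << "best: " << best.best_op_name << ", " << best.best_avg_time << " ms" << std::endl;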
...@@ -31,7 +31,6 @@ using device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances = std::tuple< ...@@ -31,7 +31,6 @@ using device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances = std::tuple<
//DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, //DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>,
//DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, //DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 8>,
......