Merge remote-tracking branch 'origin/develop' into myamlak/issue153-sign-compare

8c03672b · myamlak · 1eb7d83b · 3956085d · 8c03672b · 8c03672b
Commit 8c03672b authored Apr 28, 2022 by myamlak
20 changed files
--- a/Dockerfile
+++ b/Dockerfile
@@ -42,7 +42,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    libnuma-dev \
    libpthread-stubs0-dev \
    llvm-amdgpu \
-    miopengemm \
    pkg-config \
    python \
    python3 \
@@ -51,19 +50,15 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    python-pip \
    python3-pip \
    software-properties-common \
-    sqlite3 \
    wget \
    rocm-dev \
    rocm-device-libs \
-    rocm-opencl \
-    rocm-opencl-dev \
    rocm-cmake \
-    rocblas \
    vim \
    zlib1g-dev \
    openssh-server \
-    kmod \
-    mysql-client && \
+    clang-format-10 \
+    kmod && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -204,7 +204,7 @@ pipeline {
                stage('Clang Format') {
                    agent{ label rocmnode("nogpu") }
                    environment{
-                        execute_cmd = "find . -iname \'*.h\' \
+                        execute_cmd = "find .. -iname \'*.h\' \
                                -o -iname \'*.hpp\' \
                                -o -iname \'*.cpp\' \
                                -o -iname \'*.h.in\' \

--- a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
+++ b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
 add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp)
+target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_fwd_util)
--- a/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
+++ b/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
 add_example_executable(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
+target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_fwd_util)
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
 add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp)
+target_link_libraries(example_convnd_fwd_xdl PRIVATE conv_fwd_util)
 add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
+target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_fwd_util)
 add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
+target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_fwd_util)
--- a/example/10_conv2d_bwd_data/CMakeLists.txt
+++ b/example/10_conv2d_bwd_data/CMakeLists.txt
 add_example_executable(example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp)
+target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_fwd_util)
--- a/example/11_conv2d_bwd_weight/CMakeLists.txt
+++ b/example/11_conv2d_bwd_weight/CMakeLists.txt
 add_example_executable(example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp)
+target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_fwd_util)
--- a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
+++ b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
@@ -72,8 +72,13 @@ using DeviceConvBwdWeightInstance = ck::tensor_operation::device::
        8>;                               // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on

-using ReferenceConvBwdWeightInstance = ck::tensor_operation::host::
-    ReferenceConvBwdWeight<InDataType, WeiDataType, OutDataType, InElementOp, WeiElementOp, OutElementOp>;
+using ReferenceConvBwdWeightInstance =
+    ck::tensor_operation::host::ReferenceConvBwdWeight<InDataType,
+                                                       WeiDataType,
+                                                       OutDataType,
+                                                       InElementOp,
+                                                       WeiElementOp,
+                                                       OutElementOp>;

 int main(int argc, char* argv[])
 {

--- a/example/17_convnd_bwd_data_xdl/CMakeLists.txt
+++ b/example/17_convnd_bwd_data_xdl/CMakeLists.txt
 add_example_executable(example_convnd_bwd_data_xdl convnd_bwd_data_xdl.cpp)
+target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_fwd_util)
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
@@ -16,6 +16,31 @@ namespace ck {
 namespace tensor_operation {
 namespace device {

+/*
+ * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
+ *
+ * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix
+ * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly
+ * strided batched, but we can easily extend to other layouts. The returned offset can be either \p
+ * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB
+ * limitations.
+ *
+ * \tparam Block2CTileMap Block2CTileMap::CalculateBottomIndex() takes in id of a workgroup and
+ * returns the 2D index of the tile that it computes. \see
+ * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run().
+ *
+ * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
+ * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid
+ * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link
+ * device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link
+ * DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of
+ * pointer offset into \p ComputePtrOffsetOfStridedBatch.
+ *
+ * \note \p Block2CTileMap allows customized mapping between a workgroup and the C-tile it computes.
+ * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to
+ * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion).
+ *
+ */
 template <typename GridwiseGemm,
          typename FloatAB,
          typename FloatC,
@@ -25,7 +50,7 @@ template <typename GridwiseGemm,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
-          typename ComputeBasePrtOfBatch,
+          typename ComputePtrOffsetOfBatch,
          typename Block2CTileMap,
          bool HasMainKBlockLoop>
 __global__ void
@@ -43,7 +68,7 @@ __global__ void
            const AElementwiseOperation a_element_op,
            const BElementwiseOperation b_element_op,
            const CElementwiseOperation c_element_op,
-            const ComputeBasePrtOfBatch compute_base_ptr_of_batch_,
+            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
            const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
@@ -52,11 +77,11 @@ __global__ void
    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);

    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch_.GetABasePtr(g_idx)));
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch_.GetBBasePtr(g_idx)));
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
    const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch_.GetCBasePtr(g_idx)));
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));

    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

@@ -256,26 +281,26 @@ struct DeviceBatchedGemmXdl
        return globalblockid_to_m0_n0_block_cluster_adaptor;
    }

-    struct ComputeBasePtrOfStridedBatch
+    struct ComputePtrOffsetOfStridedBatch
    {
-        ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
+        ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
                                       index_t BatchStrideB,
                                       index_t BatchStrideC)
            : BatchStrideA_(BatchStrideA), BatchStrideB_(BatchStrideB), BatchStrideC_(BatchStrideC)
        {
        }

-        __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
+        __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
        {
            return g_idx * static_cast<long_index_t>(BatchStrideA_);
        }

-        __host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const
+        __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
        {
            return g_idx * static_cast<long_index_t>(BatchStrideB_);
        }

-        __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const
+        __host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const
        {
            return g_idx * static_cast<long_index_t>(BatchStrideC_);
        }
@@ -359,7 +384,7 @@ struct DeviceBatchedGemmXdl
                  DeviceBatchedGemmXdl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB)},
              c_grid_desc_m_n_{DeviceBatchedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC)},
              c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
-              compute_base_ptr_of_batch_{a_grid_desc_k0_m_k1_.GetElementSpaceSize(),
+              compute_ptr_offset_of_batch_{a_grid_desc_k0_m_k1_.GetElementSpaceSize(),
                                          b_grid_desc_k0_n_k1_.GetElementSpaceSize(),
                                          c_grid_desc_m_n_.GetElementSpaceSize()},
              block_2_ctile_map_{},
@@ -388,7 +413,7 @@ struct DeviceBatchedGemmXdl
        BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
        CGridDesc_M_N c_grid_desc_m_n_;
        CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
-        ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;
+        ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;
        Block2CTileMap block_2_ctile_map_;
        index_t M01_;
        index_t N01_;
@@ -448,7 +473,7 @@ struct DeviceBatchedGemmXdl
                    AElementwiseOperation,
                    BElementwiseOperation,
                    CElementwiseOperation,
-                    ComputeBasePtrOfStridedBatch,
+                    ComputePtrOffsetOfStridedBatch,
                    remove_reference_t<Block2CTileMap>,
                    true>;

@@ -467,7 +492,7 @@ struct DeviceBatchedGemmXdl
                                                  arg.a_element_op_,
                                                  arg.b_element_op_,
                                                  arg.c_element_op_,
-                                                  arg.compute_base_ptr_of_batch_,
+                                                  arg.compute_ptr_offset_of_batch_,
                                                  arg.block_2_ctile_map_);
            }
            else
@@ -482,7 +507,7 @@ struct DeviceBatchedGemmXdl
                    AElementwiseOperation,
                    BElementwiseOperation,
                    CElementwiseOperation,
-                    ComputeBasePtrOfStridedBatch,
+                    ComputePtrOffsetOfStridedBatch,
                    remove_reference_t<Block2CTileMap>,
                    false>;

@@ -501,7 +526,7 @@ struct DeviceBatchedGemmXdl
                                                  arg.a_element_op_,
                                                  arg.b_element_op_,
                                                  arg.c_element_op_,
-                                                  arg.compute_base_ptr_of_batch_,
+                                                  arg.compute_ptr_offset_of_batch_,
                                                  arg.block_2_ctile_map_);
            }


--- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -18,6 +18,9 @@ namespace ck {
 namespace tensor_operation {
 namespace device {

+/*
+ * \see \link device_batched_gemm_xdl.hpp kernel_batched_gemm_xdlops_v2r3() \endlink.
+ */
 template <typename GridwiseGemm,
          typename FloatAB,
          typename FloatC,

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -277,9 +277,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1
        // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
        // TODO make this logic more generic for more sub-dword datatype
        if constexpr(SrcVectorDim != DstVectorDim &&
-                     is_same<half_t, remove_cvref_t<SrcData>>::value &&
+                     ((is_same<half_t, remove_cvref_t<SrcData>>::value &&
                       is_same<half_t, remove_cvref_t<DstData>>::value &&
-                     SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0)
+                       SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) ||
+                      (is_same<int8_t, remove_cvref_t<SrcData>>::value &&
+                       is_same<int8_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
        {
            // each transpose does
            // DstScalarPerVector # of src vectors in src_thread_scratch_

--- a/include/ck/utility/transpose_vectors.hpp
+++ b/include/ck/utility/transpose_vectors.hpp
@@ -49,7 +49,7 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
 template <index_t NX, index_t NY>
 struct transpose_vectors<half_t, NX, NY>
 {
-    // we got [NY * NX] ammount of S data to be transposed
+    // we got [NY * NX] amount of S data to be transposed
    static constexpr index_t s_per_x = NY;
    static constexpr index_t s_per_y = NX;

@@ -83,5 +83,86 @@ struct transpose_vectors<half_t, NX, NY>
    }
 };

+// transpose int8 4x4
+__device__ void transpose_int8_4x4(const int8x4_t& x0,
+                                   const int8x4_t& x1,
+                                   const int8x4_t& x2,
+                                   const int8x4_t& x3,
+                                   int8x4_t& y0,
+                                   int8x4_t& y1,
+                                   int8x4_t& y2,
+                                   int8x4_t& y3)
+{
+    int32_t t0, t1;
+    int32_t z0, z1, z2, z3;
+    constexpr int32_t m0 = 0x05010400;
+    constexpr int32_t m1 = 0x05040100;
+    constexpr int32_t m2 = 0x07060302;
+    constexpr int32_t m3 = 0x07030602;
+
+    // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
+    //                   -- -- -- --     -- -- -- --      -  -  -  -
+    //             index  7  6  5  4      3  2  1  0     33 77 44 88
+    // index is reversed because of little endianness (least significant bits first)
+    // clang-format off
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m0));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m0));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z0) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z1) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m3));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m3));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z2) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z3) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
+    // clang-format on
+
+    y0 = bit_cast<int8x4_t>(z0);
+    y1 = bit_cast<int8x4_t>(z1);
+    y2 = bit_cast<int8x4_t>(z2);
+    y3 = bit_cast<int8x4_t>(z3);
+}
+
+template <index_t NX, index_t NY>
+struct transpose_vectors<int8_t, NX, NY>
+{
+    // we got [NY * NX] amount of S data to be transposed
+    static constexpr index_t s_per_x = NY;
+    static constexpr index_t s_per_y = NX;
+
+    using S  = int8_t;
+    using VX = vector_type<int8_t, s_per_x>;
+    using VY = vector_type<int8_t, s_per_y>;
+
+    __device__ void operator()(const StaticallyIndexedArray<const VX&, NX>& vx_tuple,
+                               StaticallyIndexedArray<VY&, NY>& vy_tuple)
+    {
+        static constexpr auto I1 = Number<1>{};
+        static constexpr auto I2 = Number<2>{};
+        static constexpr auto I3 = Number<3>{};
+        static constexpr auto I4 = Number<4>{};
+
+        static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");
+
+        // loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
+        static_for<0, NY, 4>{}([&](auto iy) {
+            static_for<0, NX, 4>{}([&](auto ix) {
+                // reference to 4 int8 data from vx_tuple
+                const auto& x_s4_0 = vx_tuple[ix].template AsType<int8x4_t>()[iy / I4];
+                const auto& x_s4_1 = vx_tuple[ix + I1].template AsType<int8x4_t>()[iy / I4];
+                const auto& x_s4_2 = vx_tuple[ix + I2].template AsType<int8x4_t>()[iy / I4];
+                const auto& x_s4_3 = vx_tuple[ix + I3].template AsType<int8x4_t>()[iy / I4];
+
+                // reference to 4 int8 data from vy_tuple
+                auto& y_s4_0 = vy_tuple(iy).template AsType<int8x4_t>()(ix / I4);
+                auto& y_s4_1 = vy_tuple(iy + I1).template AsType<int8x4_t>()(ix / I4);
+                auto& y_s4_2 = vy_tuple(iy + I2).template AsType<int8x4_t>()(ix / I4);
+                auto& y_s4_3 = vy_tuple(iy + I3).template AsType<int8x4_t>()(ix / I4);
+
+                // transpose
+                transpose_int8_4x4(x_s4_0, x_s4_1, x_s4_2, x_s4_3, y_s4_0, y_s4_1, y_s4_2, y_s4_3);
+            });
+        });
+    }
+};
+
 } // namespace ck
 #endif
--- a/library/CMakeLists.txt
+++ b/library/CMakeLists.txt
 add_subdirectory(src/host_tensor)
 add_subdirectory(src/tensor_operation_instance/gpu)
+add_subdirectory(src/utility)
--- a/library/include/ck/library/utility/conv_fwd_util.hpp
+++ b/library/include/ck/library/utility/conv_fwd_util.hpp
-#ifndef CONV_FWD_UTIL_HPP
-#define CONV_FWD_UTIL_HPP
+#pragma once

-#include <algorithm>
 #include <cstdlib>
 #include <functional>
 #include <iterator>
 #include <numeric>
 #include <sstream>
-#include <random>
 #include <tuple>
 #include <type_traits>
 #include <vector>
@@ -18,10 +15,50 @@
 #include "device_conv_fwd.hpp"
 #include "device_tensor.hpp"
 #include "element_wise_operation.hpp"
+#include "fill.hpp"
 #include "host_tensor.hpp"
+#include "op_instance_engine.hpp"
 #include "reference_conv_fwd.hpp"
 #include "tensor_layout.hpp"

+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr<element_wise::PassThrough,
+                                              element_wise::PassThrough,
+                                              element_wise::PassThrough>;
+namespace device_conv1d_fwd_instance {
+
+void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+
+} // namespace device_conv1d_fwd_instance
+namespace device_conv2d_fwd_instance {
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+
+} // namespace device_conv2d_fwd_instance
+namespace device_conv3d_fwd_instance {
+
+void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+
+} // namespace device_conv3d_fwd_instance
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+
 namespace ck {
 namespace utils {
 namespace conv {
@@ -47,20 +84,7 @@ std::size_t get_flops(ck::index_t N,
                      ck::index_t C,
                      ck::index_t K,
                      const std::vector<ck::index_t>& filter_spatial_lengths,
-                      const std::vector<ck::index_t>& output_spatial_lengths)
-{
-    // 2 * N * K * <output spatial lengths product> * C * <filter spatial lengths product>
-    return static_cast<std::size_t>(2) * N * K *
-           std::accumulate(std::begin(output_spatial_lengths),
-                           std::end(output_spatial_lengths),
-                           static_cast<std::size_t>(1),
-                           std::multiplies<std::size_t>()) *
-           C *
-           std::accumulate(std::begin(filter_spatial_lengths),
-                           std::end(filter_spatial_lengths),
-                           static_cast<std::size_t>(1),
-                           std::multiplies<std::size_t>());
-}
+                      const std::vector<ck::index_t>& output_spatial_lengths);

 /**
 * @brief      Calculate number of bytes read/write by convolution algorithm.
@@ -110,20 +134,7 @@ std::size_t get_btype(ck::index_t N,

 struct ConvParams
 {
-    ConvParams()
-        : num_dim_spatial(2),
-          N(128),
-          K(256),
-          C(192),
-          filter_spatial_lengths(2, 3),
-          input_spatial_lengths(2, 71),
-          conv_filter_strides(2, 2),
-          conv_filter_dilations(2, 1),
-          input_left_pads(2, 1),
-          input_right_pads(2, 1)
-    {
-    }
-
+    ConvParams();
    ConvParams(ck::index_t n_dim,
               ck::index_t n_batch,
               ck::index_t n_out_channels,
@@ -133,30 +144,7 @@ struct ConvParams
               const std::vector<ck::index_t>& strides,
               const std::vector<ck::index_t>& dilations,
               const std::vector<ck::index_t>& left_pads,
-               const std::vector<ck::index_t>& right_pads)
-        : num_dim_spatial(n_dim),
-          N(n_batch),
-          K(n_out_channels),
-          C(n_in_channels),
-          filter_spatial_lengths(filters_len),
-          input_spatial_lengths(input_len),
-          conv_filter_strides(strides),
-          conv_filter_dilations(dilations),
-          input_left_pads(left_pads),
-          input_right_pads(right_pads)
-    {
-        if(ck::type_convert<ck::index_t>(filter_spatial_lengths.size()) != num_dim_spatial ||
-           ck::type_convert<ck::index_t>(input_spatial_lengths.size()) != num_dim_spatial ||
-           ck::type_convert<ck::index_t>(conv_filter_strides.size()) != num_dim_spatial ||
-           ck::type_convert<ck::index_t>(conv_filter_dilations.size()) != num_dim_spatial ||
-           ck::type_convert<ck::index_t>(input_left_pads.size()) != num_dim_spatial ||
-           ck::type_convert<ck::index_t>(input_right_pads.size()) != num_dim_spatial)
-        {
-            throw(std::runtime_error(
-                "ConvParams::GetOutputSpatialLengths: "
-                "parameter size is different from number of declared dimensions!"));
-        }
-    }
+               const std::vector<ck::index_t>& right_pads);

    ck::index_t num_dim_spatial;
    ck::index_t N;
@@ -172,36 +160,11 @@ struct ConvParams
    std::vector<ck::index_t> input_left_pads;
    std::vector<ck::index_t> input_right_pads;

-    std::vector<ck::index_t> GetOutputSpatialLengths() const
-    {
-        if(ck::type_convert<ck::index_t>(filter_spatial_lengths.size()) != num_dim_spatial ||
-           ck::type_convert<ck::index_t>(input_spatial_lengths.size()) != num_dim_spatial ||
-           ck::type_convert<ck::index_t>(conv_filter_strides.size()) != num_dim_spatial ||
-           ck::type_convert<ck::index_t>(conv_filter_dilations.size()) != num_dim_spatial ||
-           ck::type_convert<ck::index_t>(input_left_pads.size()) != num_dim_spatial ||
-           ck::type_convert<ck::index_t>(input_right_pads.size()) != num_dim_spatial)
-        {
-            throw(std::runtime_error(
-                "ConvParams::GetOutputSpatialLengths: "
-                "parameter size is different from number of declared dimensions!"));
-        }
-
-        std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
-        for(ck::index_t i = 0; i < num_dim_spatial; ++i)
-        {
-            // XEff = (X - 1) * conv_dilation_w + 1;
-            // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-            const ck::index_t idx_eff =
-                (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
-            out_spatial_len[i] =
-                (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
-                    conv_filter_strides[i] +
-                1;
-        }
-        return out_spatial_len;
-    }
+    std::vector<ck::index_t> GetOutputSpatialLengths() const;
 };

+ConvParams parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[]);
+
 /**
 * @brief      Gets the host tensor descriptor.
 *
@@ -223,13 +186,13 @@ HostTensorDescriptor get_host_tensor_descriptor(const std::vector<std::size_t>&
                 std::is_same<TensorLayout, ck::tensor_layout::convolution::NKW>::value)
    {

-        return HostTensorDescriptor(dims, std::vector<std::size_t>({C * dims[2], dims[2], 1}));
+        return HostTensorDescriptor(dims, std::vector<std::size_t>{C * dims[2], dims[2], 1});
    }
    else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NWC>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::KXC>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::NWK>::value)
    {
-        return HostTensorDescriptor(dims, std::vector<std::size_t>({C * dims[2], 1, C}));
+        return HostTensorDescriptor(dims, std::vector<std::size_t>{C * dims[2], 1, C});
    }
    // 2D
    else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCHW>::value ||
@@ -275,132 +238,14 @@ HostTensorDescriptor get_host_tensor_descriptor(const std::vector<std::size_t>&
    throw std::runtime_error(err_msg.str());
 }

-template <typename InDataType  = float,
-          typename WeiDataType = float,
-          typename OutDataType = float,
-          typename InLayout    = ck::tensor_layout::convolution::NHWC,
-          typename WeiLayout   = ck::tensor_layout::convolution::KYXC,
-          typename OutLayout   = ck::tensor_layout::convolution::NHWK>
-auto get_host_tensors(const ConvParams& params, bool init = true)
-{
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
-    input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
-
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
-    filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
-
-    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
-    output_dims.insert(std::end(output_dims),
-                       std::begin(output_spatial_lengths),
-                       std::end(output_spatial_lengths));
-
-    Tensor<InDataType> input(ck::utils::conv::get_host_tensor_descriptor(input_dims, InLayout{}));
-    Tensor<WeiDataType> weights(
-        ck::utils::conv::get_host_tensor_descriptor(filter_dims, WeiLayout{}));
-    Tensor<OutDataType> host_output(
-        ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));
-    Tensor<OutDataType> device_output(
-        ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));
-
-    if(init)
-    {
-        std::mt19937 gen(11939);
-        if constexpr(std::is_same<InDataType, uint8_t>::value)
-        {
-            std::uniform_int_distribution<> dis(-5, 5);
-            std::generate(
-                input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
-            std::generate(
-                weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
-        }
-        else
-        {
-            std::uniform_real_distribution<> dis(0.f, 1.f);
-            std::generate(
-                input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
-            std::generate(
-                weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
-        }
-        std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
-        std::fill(device_output.begin(), device_output.end(), OutDataType(0.f));
-    }
-
-    return std::make_tuple(input, weights, host_output, device_output);
-}
-
 HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                       int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWK{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWK{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWK{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
+                                                       int num_dim_spatial = 2);

 HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                        int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KZYXC{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KYXC{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KXC{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
+                                                        int num_dim_spatial = 2);

 HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                      int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
+                                                      int num_dim_spatial = 2);

 template <ck::index_t NDim,
          typename InDataType  = float,
@@ -434,123 +279,293 @@ void run_reference_convolution_forward(const ConvParams& params,
    ref_invoker.Run(ref_argument);
 }

-template <ck::index_t NDim,
-          typename InDataType  = float,
-          typename WeiDataType = float,
-          typename OutDataType = float,
-          template <ck::index_t, typename, typename, typename>
-          class DeviceConvNDFwdInstance>
-void run_convolution_forward(const ConvParams& params,
-                             const Tensor<InDataType>& input,
-                             const Tensor<WeiDataType>& weights,
-                             Tensor<OutDataType>& output)
+template <typename InDataType, typename WeiDataType, typename OutDataType>
+struct ConvolutionFwdInstances;
+
+template <>
+struct ConvolutionFwdInstances<float, float, float>
 {
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    template <int NumDimSpatial,
+              typename std::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
+    static std::vector<DeviceConvFwdNoOpPtr> Get()
+    {
+        std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
+        if constexpr(NumDimSpatial == 1)
+        {
+            ck::tensor_operation::device::device_conv1d_fwd_instance::
+                add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
+        }
+        else if constexpr(NumDimSpatial == 2)
+        {
+            ck::tensor_operation::device::device_conv2d_fwd_instance::
+                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
+        }
+        else if constexpr(NumDimSpatial == 3)
+        {
+            ck::tensor_operation::device::device_conv3d_fwd_instance::
+                add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
+        }
+        return conv_ptrs;
+    }
+};

-    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
-
-    in_device_buf.ToDevice(input.mData.data());
-    wei_device_buf.ToDevice(weights.mData.data());
-    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-
-    auto conv     = DeviceConvNDFwdInstance<NDim, InDataType, WeiDataType, OutDataType>();
-    auto invoker  = conv.MakeInvoker();
-    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                      params.N,
-                                      params.K,
-                                      params.C,
-                                      params.input_spatial_lengths,
-                                      params.filter_spatial_lengths,
-                                      output_spatial_lengths,
-                                      params.conv_filter_strides,
-                                      params.conv_filter_dilations,
-                                      params.input_left_pads,
-                                      params.input_right_pads,
-                                      PassThrough{},
-                                      PassThrough{},
-                                      PassThrough{});
+template <>
+struct ConvolutionFwdInstances<half_t, half_t, half_t>
+{
+    template <int NumDimSpatial,
+              typename std::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
+    static std::vector<DeviceConvFwdNoOpPtr> Get()
+    {
+        std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
+        if constexpr(NumDimSpatial == 1)
+        {
+            ck::tensor_operation::device::device_conv1d_fwd_instance::
+                add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
+            return conv_ptrs;
+        }
+        else if constexpr(NumDimSpatial == 2)
+        {
+            ck::tensor_operation::device::device_conv2d_fwd_instance::
+                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
+            ck::tensor_operation::device::device_conv2d_fwd_instance::
+                add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
+        }
+        else if constexpr(NumDimSpatial == 3)
+        {
+            ck::tensor_operation::device::device_conv3d_fwd_instance::
+                add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
+        }
+        return conv_ptrs;
+    }
+};

-    if(!conv.IsSupportedArgument(argument))
+template <>
+struct ConvolutionFwdInstances<bhalf_t, bhalf_t, bhalf_t>
+{
+    template <int NumDimSpatial,
+              typename std::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
+    static std::vector<DeviceConvFwdNoOpPtr> Get()
    {
-        throw std::runtime_error(
-            "Error! device_conv with the specified compilation parameters does "
-            "not support this Conv problem");
+        std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
+        if constexpr(NumDimSpatial == 1)
+        {
+            ck::tensor_operation::device::device_conv1d_fwd_instance::
+                add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
+        }
+        else if constexpr(NumDimSpatial == 2)
+        {
+            ck::tensor_operation::device::device_conv2d_fwd_instance::
+                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
        }
+        else if constexpr(NumDimSpatial == 3)
+        {
+            ck::tensor_operation::device::device_conv3d_fwd_instance::
+                add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
+        }
+        return conv_ptrs;
+    }
+};

-    invoker.Run(argument);
-    out_device_buf.FromDevice(output.mData.data());
-}
+template <>
+struct ConvolutionFwdInstances<int8_t, int8_t, int8_t>
+{
+    template <int NumDimSpatial,
+              typename std::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
+    static std::vector<DeviceConvFwdNoOpPtr> Get()
+    {
+        std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
+        if constexpr(NumDimSpatial == 1)
+        {
+            ck::tensor_operation::device::device_conv1d_fwd_instance::
+                add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs);
+        }
+        else if constexpr(NumDimSpatial == 2)
+        {
+            ck::tensor_operation::device::device_conv2d_fwd_instance::
+                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
+        }
+        else if constexpr(NumDimSpatial == 3)
+        {
+            ck::tensor_operation::device::device_conv3d_fwd_instance::
+                add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs);
+        }
+        return conv_ptrs;
+    }
+};

-template <ck::index_t NDim,
-          typename InDataType  = float,
-          typename WeiDataType = float,
-          typename OutDataType = float>
-bool run_convolution_forward_instances(const ConvParams& params,
-                                       const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs,
-                                       const Tensor<InDataType>& input,
-                                       const Tensor<WeiDataType>& weights,
-                                       Tensor<OutDataType>& output,
-                                       const Tensor<OutDataType>& host_output)
+template <typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InLayout         = ck::tensor_layout::convolution::NHWC,
+          typename WeiLayout        = ck::tensor_layout::convolution::KYXC,
+          typename OutLayout        = ck::tensor_layout::convolution::NHWK,
+          typename InElementwiseOp  = ck::tensor_operation::element_wise::PassThrough,
+          typename WeiElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
+          typename OutElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
+          typename InputInitFun     = FillUniform<InDataType>,
+          typename WeightsInitFun   = FillUniform<WeiDataType>>
+class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, WeiDataType>
 {
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    using DeviceConvFwdOp = tensor_operation::device::
+        DeviceConvFwd<InElementwiseOp, WeiElementwiseOp, OutElementwiseOp>;
+    using DeviceMemPtr  = std::unique_ptr<DeviceMem>;
+    using DeviceBuffers = std::vector<DeviceMemPtr>;
+    using BaseType      = ck::utils::OpInstance<OutDataType, InDataType, WeiDataType>;
+    template <typename T>
+    using TensorPtr      = std::unique_ptr<Tensor<T>>;
+    using InTensorsTuple = std::tuple<TensorPtr<InDataType>, TensorPtr<WeiDataType>>;
+
+    public:
+    ConvFwdOpInstance()                         = delete;
+    ConvFwdOpInstance(const ConvFwdOpInstance&) = default;
+    ConvFwdOpInstance& operator=(const ConvFwdOpInstance&) = default;
+
+    ConvFwdOpInstance(const ConvParams& params,
+                      bool do_init                         = true,
+                      const InputInitFun& input_init_f     = InputInitFun{},
+                      const WeightsInitFun& weights_init_f = WeightsInitFun{})
+        : BaseType(),
+          params_{params},
+          output_spatial_lengths_{params.GetOutputSpatialLengths()},
+          do_init_{do_init},
+          input_init_f_{input_init_f},
+          weights_init_f_{weights_init_f}
+    {
+    }

-    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
-
-    in_device_buf.ToDevice(input.mData.data());
-    wei_device_buf.ToDevice(weights.mData.data());
-    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-
-    bool res{true};
-    for(auto& conv_ptr : conv_ptrs)
-    {
-        auto invoker  = conv_ptr->MakeInvokerPointer();
-        auto argument = conv_ptr->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            params.N,
-            params.K,
-            params.C,
-            params.input_spatial_lengths,
-            params.filter_spatial_lengths,
-            output_spatial_lengths,
-            params.conv_filter_strides,
-            params.conv_filter_dilations,
-            params.input_left_pads,
-            params.input_right_pads,
-            PassThrough{},
-            PassThrough{},
-            PassThrough{});
+    virtual ~ConvFwdOpInstance() override{};

-        if(conv_ptr->IsSupportedArgument(argument.get()))
+    virtual InTensorsTuple GetInputTensors() const override
    {
-            float atol{1e-5f};
-            float rtol{1e-4f};
-            if constexpr(std::is_same_v<InDataType, ck::half_t>)
+        std::vector<std::size_t> input_dims{static_cast<std::size_t>(params_.N),
+                                            static_cast<std::size_t>(params_.C)};
+        input_dims.insert(std::end(input_dims),
+                          std::begin(params_.input_spatial_lengths),
+                          std::end(params_.input_spatial_lengths));
+
+        std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params_.K),
+                                             static_cast<std::size_t>(params_.C)};
+        filter_dims.insert(std::end(filter_dims),
+                           std::begin(params_.filter_spatial_lengths),
+                           std::end(params_.filter_spatial_lengths));
+
+        auto input = std::make_unique<Tensor<InDataType>>(
+            get_host_tensor_descriptor(input_dims, InLayout{}));
+        auto weights = std::make_unique<Tensor<WeiDataType>>(
+            get_host_tensor_descriptor(filter_dims, WeiLayout{}));
+
+        if(do_init_)
        {
-                atol = 1e-4f;
-                rtol = 2.5e-3f;
+            input_init_f_(input->begin(), input->end());
+            weights_init_f_(weights->begin(), weights->end());
        }
-            invoker->Run(argument.get());
-            out_device_buf.FromDevice(output.mData.data());
-            res = res &&
-                  ck::utils::check_err(
-                      output.mData, host_output.mData, "Error: incorrect results!", atol, rtol);
-            hipGetErrorString(
-                hipMemset(out_device_buf.GetDeviceBuffer(), 0, out_device_buf.mMemSize));
+
+        return std::make_tuple(std::move(input), std::move(weights));
    }
+
+    virtual TensorPtr<OutDataType> GetOutputTensor() const override
+    {
+        std::vector<std::size_t> output_dims{static_cast<std::size_t>(params_.N),
+                                             static_cast<std::size_t>(params_.K)};
+        output_dims.insert(std::end(output_dims),
+                           std::begin(output_spatial_lengths_),
+                           std::end(output_spatial_lengths_));
+        auto output = std::make_unique<Tensor<OutDataType>>(
+            get_host_tensor_descriptor(output_dims, OutLayout{}));
+
+        if(do_init_)
+        {
+            std::fill(output->begin(), output->end(), OutDataType(0.f));
        }
-    return res;
-}
+        return output;
+    }
+
+    virtual std::unique_ptr<tensor_operation::device::BaseInvoker>
+    MakeInvokerPointer(tensor_operation::device::BaseOperator* op_ptr) const override
+    {
+        static_assert(
+            std::is_same_v<InElementwiseOp, ck::tensor_operation::element_wise::PassThrough>);
+        static_assert(
+            std::is_same_v<OutElementwiseOp, ck::tensor_operation::element_wise::PassThrough>);
+        static_assert(
+            std::is_same_v<WeiElementwiseOp, ck::tensor_operation::element_wise::PassThrough>);
+
+        auto conv_ptr = dynamic_cast<DeviceConvFwdOp*>(op_ptr);
+        if(!conv_ptr)
+        {
+            throw std::runtime_error(
+                "[ConvFwdOpInstance]: couldn't cast op_ptr to DeviceConvFwdNoOpPtr type!");
+        }
+        return conv_ptr->MakeInvokerPointer();
+    }
+
+    virtual std::unique_ptr<tensor_operation::device::BaseArgument>
+    MakeArgumentPointer(tensor_operation::device::BaseOperator* op_ptr,
+                        const DeviceBuffers& in_device_buffers,
+                        const DeviceMemPtr& out_device_buffer) const override
+    {
+        static_assert(
+            std::is_same_v<InElementwiseOp, ck::tensor_operation::element_wise::PassThrough>);
+        static_assert(
+            std::is_same_v<OutElementwiseOp, ck::tensor_operation::element_wise::PassThrough>);
+        static_assert(
+            std::is_same_v<WeiElementwiseOp, ck::tensor_operation::element_wise::PassThrough>);
+
+        auto conv_ptr = dynamic_cast<DeviceConvFwdOp*>(op_ptr);
+        if(!conv_ptr)
+        {
+            throw std::runtime_error(
+                "[ConvFwdOpInstance]: couldn't cast op_ptr to DeviceConvFwdNoOpPtr type!");
+        }
+
+        return conv_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buffers[0]->GetDeviceBuffer()),
+            static_cast<WeiDataType*>(in_device_buffers[1]->GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buffer->GetDeviceBuffer()),
+            params_.N,
+            params_.K,
+            params_.C,
+            params_.input_spatial_lengths,
+            params_.filter_spatial_lengths,
+            output_spatial_lengths_,
+            params_.conv_filter_strides,
+            params_.conv_filter_dilations,
+            params_.input_left_pads,
+            params_.input_right_pads,
+            InElementwiseOp{},
+            WeiElementwiseOp{},
+            OutElementwiseOp{});
+    }
+
+    virtual std::size_t GetFlops() const override
+    {
+        return get_flops(params_.N,
+                         params_.C,
+                         params_.K,
+                         params_.filter_spatial_lengths,
+                         output_spatial_lengths_);
+    }
+
+    virtual std::size_t GetBtype() const override
+    {
+        return get_btype<InDataType, WeiDataType, OutDataType>(params_.N,
+                                                               params_.C,
+                                                               params_.K,
+                                                               params_.input_spatial_lengths,
+                                                               params_.filter_spatial_lengths,
+                                                               output_spatial_lengths_);
+    }
+
+    private:
+    const ConvParams& params_;
+    const std::vector<ck::index_t> output_spatial_lengths_;
+    const bool do_init_;
+    const InputInitFun& input_init_f_;
+    const WeightsInitFun& weights_init_f_;
+};

 } // namespace conv
 } // namespace utils
 } // namespace ck

-#endif
+std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParams& p);
--- a/library/include/ck/library/utility/fill.hpp
+++ b/library/include/ck/library/utility/fill.hpp
+#pragma once
+
+#include <algorithm>
+#include <random>
+
+#include "data_type.hpp"
+
+namespace ck {
+namespace utils {
+
+// template <typename T, class Enable = void>
+// struct FillUniform;
+
+// TODO: what's wrong with this specialization???
+// err: segmentation fault in mt19937 - infinite loop like.
+// template <typename T>
+// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
+//                                               !std::is_same<T, bhalf_t>::value>::type>
+// {
+//     int a_{0};
+//     int b_{5};
+//     // T a_ = T{0};
+//     // T b_ = T{5};
+
+//     template <typename ForwardIter>
+//     void operator()(ForwardIter first, ForwardIter last) const
+//     {
+//         std::mt19937 gen{11939};
+//         std::uniform_int_distribution<int> dis(a_, b_);
+//         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+//     }
+// };
+
+// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
+//                                               std::is_same<T, bhalf_t>::value>::type>
+template <typename T>
+struct FillUniform
+{
+    float a_{0};
+    float b_{5};
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        std::mt19937 gen{11939};
+        std::uniform_real_distribution<> dis(a_, b_);
+        std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+    }
+};
+
+template <typename T>
+struct FillMonotonicSeq
+{
+    T init_value_{0};
+    T step_{1};
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        std::generate(first, last, [=, n = init_value_]() mutable {
+            auto tmp = n;
+            n += step_;
+            return tmp;
+        });
+    }
+};
+
+template <typename T>
+struct FillConstant
+{
+    T value_{0};
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        std::fill(first, last, value_);
+    }
+};
+
+} // namespace utils
+} // namespace ck
--- a/library/include/ck/library/utility/op_instance_engine.hpp
+++ b/library/include/ck/library/utility/op_instance_engine.hpp
+#pragma once
+
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "check_err.hpp"
+#include "device_base.hpp"
+#include "functional2.hpp"
+
+namespace ck {
+namespace utils {
+
+struct ProfileBestConfig
+{
+    std::string best_op_name;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_tflops     = std::numeric_limits<float>::max();
+    float best_gb_per_sec = std::numeric_limits<float>::max();
+};
+
+/**
+ * @brief      This class describes an operation instance(s).
+ *
+ *             Op instance defines a particular specializations of operator
+ *             template. Thanks to this specific input/output data types, data
+ *             layouts and modifying elementwise operations it is able to create
+ *             it's input/output tensors, provide pointers to instances which
+ *             can execute it and all operation specific parameters.
+ */
+template <typename OutDataType, typename... InArgTypes>
+class OpInstance
+{
+    public:
+    template <typename T>
+    using TensorPtr      = std::unique_ptr<Tensor<T>>;
+    using InTensorsTuple = std::tuple<TensorPtr<InArgTypes>...>;
+    using DeviceMemPtr   = std::unique_ptr<DeviceMem>;
+    using DeviceBuffers  = std::vector<DeviceMemPtr>;
+
+    OpInstance()                  = default;
+    OpInstance(const OpInstance&) = default;
+    OpInstance& operator=(const OpInstance&) = default;
+    virtual ~OpInstance(){};
+
+    virtual InTensorsTuple GetInputTensors() const         = 0;
+    virtual TensorPtr<OutDataType> GetOutputTensor() const = 0;
+    virtual std::unique_ptr<tensor_operation::device::BaseInvoker>
+    MakeInvokerPointer(tensor_operation::device::BaseOperator*) const = 0;
+    virtual std::unique_ptr<tensor_operation::device::BaseArgument>
+    MakeArgumentPointer(tensor_operation::device::BaseOperator*,
+                        const DeviceBuffers&,
+                        const DeviceMemPtr&) const = 0;
+    virtual std::size_t GetFlops() const           = 0;
+    virtual std::size_t GetBtype() const           = 0;
+};
+
+/**
+ * @brief      A generic operation instance run engine.
+ */
+template <typename OutDataType, typename... InArgTypes>
+class OpInstanceRunEngine
+{
+    public:
+    using OpInstanceT = OpInstance<InArgTypes..., OutDataType>;
+    template <typename T>
+    using TensorPtr        = std::unique_ptr<Tensor<T>>;
+    using DeviceMemPtr     = std::unique_ptr<DeviceMem>;
+    using InTensorsTuple   = std::tuple<TensorPtr<InArgTypes>...>;
+    using DeviceBuffers    = std::vector<DeviceMemPtr>;
+    using InArgsTypesTuple = std::tuple<InArgTypes...>;
+
+    OpInstanceRunEngine() = delete;
+
+    template <typename ReferenceOp = std::function<void()>>
+    OpInstanceRunEngine(const OpInstanceT& op_instance,
+                        const ReferenceOp& reference_op = ReferenceOp{})
+        : op_instance_{op_instance}
+    {
+        in_tensors_ = op_instance_.GetInputTensors();
+        out_tensor_ = op_instance_.GetOutputTensor();
+
+        if constexpr(std::is_invocable_v<ReferenceOp,
+                                         const Tensor<InArgTypes>&...,
+                                         Tensor<OutDataType>&>)
+        {
+            ref_output_ = op_instance_.GetOutputTensor();
+            CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+        }
+        AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
+        out_device_buffer_ =
+            std::make_unique<DeviceMem>(sizeof(OutDataType) * out_tensor_->mDesc.GetElementSpace());
+        out_device_buffer_->SetZero();
+    }
+
+    virtual ~OpInstanceRunEngine(){};
+
+    template <typename OpInstancePtr>
+    bool Test(const std::vector<OpInstancePtr>& op_ptrs)
+    {
+        bool res{true};
+        for(auto& op_ptr : op_ptrs)
+        {
+            auto invoker  = op_instance_.MakeInvokerPointer(op_ptr.get());
+            auto argument = op_instance_.MakeArgumentPointer(
+                op_ptr.get(), in_device_buffers_, out_device_buffer_);
+            if(op_ptr->IsSupportedArgument(argument.get()))
+            {
+                invoker->Run(argument.get());
+                out_device_buffer_->FromDevice(out_tensor_->mData.data());
+                if(!ref_output_)
+                {
+                    throw std::runtime_error(
+                        "OpInstanceRunEngine::Test: Reference value not availabe."
+                        " You have to provide reference function.");
+                }
+                // TODO: enable flexible use of custom check_error functions
+                res = res && check_err(out_tensor_->mData, ref_output_->mData);
+                out_device_buffer_->SetZero();
+            }
+        }
+        return res;
+    }
+
+    template <typename OpInstancePtr>
+    ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
+                              int nrepeat          = 100,
+                              bool do_verification = false,
+                              bool do_log          = false)
+    {
+        bool res{true};
+        ProfileBestConfig best_config;
+
+        for(auto& op_ptr : op_ptrs)
+        {
+            auto invoker  = op_instance_.MakeInvokerPointer(op_ptr.get());
+            auto argument = op_instance_.MakeArgumentPointer(
+                op_ptr.get(), in_device_buffers_, out_device_buffer_);
+            if(op_ptr->IsSupportedArgument(argument.get()))
+            {
+                std::string op_name = op_ptr->GetTypeString();
+                float avg_time      = invoker->Run(argument.get(), nrepeat);
+
+                std::size_t flops     = op_instance_.GetFlops();
+                std::size_t num_btype = op_instance_.GetBtype();
+                float tflops          = static_cast<float>(flops) / 1.E9 / avg_time;
+                float gb_per_sec      = num_btype / 1.E6 / avg_time;
+
+                std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                          << " GB/s, " << op_name << std::endl;
+
+                if(tflops < best_config.best_tflops)
+                {
+                    best_config.best_op_name    = op_name;
+                    best_config.best_tflops     = tflops;
+                    best_config.best_gb_per_sec = gb_per_sec;
+                    best_config.best_avg_time   = avg_time;
+                }
+
+                if(do_verification)
+                {
+                    out_device_buffer_->FromDevice(out_tensor_->mData.data());
+                    if(!ref_output_)
+                    {
+                        throw std::runtime_error(
+                            "OpInstanceRunEngine::Profile: Reference value not availabe."
+                            " You have to provide reference function.");
+                    }
+                    // TODO: enable flexible use of custom check_error functions
+                    res = res && CheckErr(out_tensor_->mData, ref_output_->mData);
+
+                    if(do_log) {}
+                }
+                out_device_buffer_->SetZero();
+            }
+        }
+        return best_config;
+    }
+
+    void SetAtol(double a) { atol_ = a; }
+    void SetRtol(double r) { rtol_ = r; }
+
+    private:
+    template <typename F, std::size_t... Is>
+    void CallRefOpUnpackArgs(const F& f, std::index_sequence<Is...>) const
+    {
+        f(*std::get<Is>(in_tensors_)..., *ref_output_);
+    }
+
+    template <std::size_t... Is>
+    void AllocateDeviceInputTensors(std::index_sequence<Is...>)
+    {
+        (AllocateDeviceInputTensorsImpl<Is>(), ...);
+    }
+
+    template <std::size_t Index>
+    void AllocateDeviceInputTensorsImpl()
+    {
+        const auto& ts = std::get<Index>(in_tensors_);
+        in_device_buffers_
+            .emplace_back(
+                std::make_unique<DeviceMem>(sizeof(std::tuple_element_t<Index, InArgsTypesTuple>) *
+                                            ts->mDesc.GetElementSpace()))
+            ->ToDevice(ts->mData.data());
+    }
+
+    static constexpr std::size_t kNInArgs_ = std::tuple_size_v<InTensorsTuple>;
+    const OpInstanceT& op_instance_;
+    double rtol_{1e-5};
+    double atol_{1e-8};
+
+    InTensorsTuple in_tensors_;
+    TensorPtr<OutDataType> out_tensor_;
+    TensorPtr<OutDataType> ref_output_;
+
+    DeviceBuffers in_device_buffers_;
+    DeviceMemPtr out_device_buffer_;
+
+    template <typename T>
+    bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
+    {
+        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
+    }
+};
+
+} // namespace utils
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
@@ -28,19 +28,19 @@ using device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances = std::tuple<
        //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
        //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
        //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    32,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    32,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    32,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    32,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    64,    32,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    32,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    32,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    32,    32,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    32,   128,    32,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    32,    32,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    32,    64,    32,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F32,   F32,   F32,     F32,      F32, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>
    // clang-format on
    >;


--- a/library/src/utility/CMakeLists.txt
+++ b/library/src/utility/CMakeLists.txt
+include_directories(BEFORE
+    ${PROJECT_SOURCE_DIR}/include/ck
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
+    ${PROJECT_SOURCE_DIR}/include/ck/utility
+    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
+    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
+    ${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
+)
+
+set(CONV_FWD_UTIL_SOURCE
+    conv_fwd_util.cpp
+)
+
+add_library(conv_fwd_util SHARED ${CONV_FWD_UTIL_SOURCE})
+target_link_libraries(conv_fwd_util PRIVATE host_tensor)
+target_compile_features(conv_fwd_util PUBLIC)
+set_target_properties(conv_fwd_util PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_include_directories(conv_fwd_util SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
+
+clang_tidy_check(conv_fwd_util)
--- a/library/src/utility/conv_fwd_util.cpp
+++ b/library/src/utility/conv_fwd_util.cpp
+
+#include "conv_fwd_util.hpp"
+
+namespace ck {
+namespace utils {
+namespace conv {
+
+/**
+ * @brief      Calculate number of FLOPs for Convolution
+ *
+ * @param[in]  N                       Batch size.
+ * @param[in]  C                       Number of input channels.
+ * @param[in]  K                       Number of output channels.
+ * @param[in]  filter_spatial_lengths  Filter spatial dimensions lengths.
+ * @param[in]  output_spatial_lengths  Convolution output spatial dimensions
+ *                                     lengths.
+ *
+ * @return     The number of flops.
+ */
+std::size_t get_flops(ck::index_t N,
+                      ck::index_t C,
+                      ck::index_t K,
+                      const std::vector<ck::index_t>& filter_spatial_lengths,
+                      const std::vector<ck::index_t>& output_spatial_lengths)
+{
+    // 2 * N * K * <output spatial lengths product> * C * <filter spatial lengths product>
+    return static_cast<std::size_t>(2) * N * K *
+           std::accumulate(std::begin(output_spatial_lengths),
+                           std::end(output_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<std::size_t>()) *
+           C *
+           std::accumulate(std::begin(filter_spatial_lengths),
+                           std::end(filter_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<std::size_t>());
+}
+
+ConvParams::ConvParams()
+        : num_dim_spatial(2),
+          N(128),
+          K(256),
+          C(192),
+          filter_spatial_lengths(2, 3),
+          input_spatial_lengths(2, 71),
+          conv_filter_strides(2, 2),
+          conv_filter_dilations(2, 1),
+          input_left_pads(2, 1),
+          input_right_pads(2, 1)
+{
+}
+
+ConvParams::ConvParams(ck::index_t n_dim,
+                       ck::index_t n_batch,
+                       ck::index_t n_out_channels,
+                       ck::index_t n_in_channels,
+                       const std::vector<ck::index_t>& filters_len,
+                       const std::vector<ck::index_t>& input_len,
+                       const std::vector<ck::index_t>& strides,
+                       const std::vector<ck::index_t>& dilations,
+                       const std::vector<ck::index_t>& left_pads,
+                       const std::vector<ck::index_t>& right_pads)
+    : num_dim_spatial(n_dim),
+      N(n_batch),
+      K(n_out_channels),
+      C(n_in_channels),
+      filter_spatial_lengths(filters_len),
+      input_spatial_lengths(input_len),
+      conv_filter_strides(strides),
+      conv_filter_dilations(dilations),
+      input_left_pads(left_pads),
+      input_right_pads(right_pads)
+{
+    if(ck::type_convert<ck::index_t>(filter_spatial_lengths.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(input_spatial_lengths.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(conv_filter_strides.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(conv_filter_dilations.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(input_left_pads.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(input_right_pads.size()) != num_dim_spatial)
+    {
+        throw(std::runtime_error(
+            "ConvParams::GetOutputSpatialLengths: "
+            "parameter size is different from number of declared dimensions!"));
+    }
+}
+
+std::vector<ck::index_t> ConvParams::GetOutputSpatialLengths() const
+{
+    if(ck::type_convert<ck::index_t>(filter_spatial_lengths.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(input_spatial_lengths.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(conv_filter_strides.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(conv_filter_dilations.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(input_left_pads.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(input_right_pads.size()) != num_dim_spatial)
+    {
+        throw(std::runtime_error(
+            "ConvParams::GetOutputSpatialLengths: "
+            "parameter size is different from number of declared dimensions!"));
+    }
+
+    std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
+    for(ck::index_t i = 0; i < num_dim_spatial; ++i)
+    {
+        // XEff = (X - 1) * conv_dilation_w + 1;
+        // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+        const ck::index_t idx_eff =
+            (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
+        out_spatial_len[i] =
+            (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
+                conv_filter_strides[i] +
+            1;
+    }
+    return out_spatial_len;
+}
+
+ConvParams parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
+{
+    ck::utils::conv::ConvParams params;
+
+    params.num_dim_spatial = num_dim_spatial;
+    params.N               = std::stoi(argv[arg_idx++]);
+    params.K               = std::stoi(argv[arg_idx++]);
+    params.C               = std::stoi(argv[arg_idx++]);
+
+    params.filter_spatial_lengths.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.input_spatial_lengths.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.conv_filter_strides.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.conv_filter_dilations.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.input_left_pads.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.input_right_pads.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+    }
+
+    return params;
+}
+
+HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims,
+                                                       int num_dim_spatial)
+{
+    namespace tl = ck::tensor_layout::convolution;
+
+    switch(num_dim_spatial)
+    {
+    case 3: {
+        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWK{});
+    }
+    case 2: {
+        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWK{});
+    }
+    case 1: {
+        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWK{});
+    }
+    default: {
+        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
+    }
+    }
+}
+
+HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
+                                                        int num_dim_spatial)
+{
+    namespace tl = ck::tensor_layout::convolution;
+
+    switch(num_dim_spatial)
+    {
+    case 3: {
+        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KZYXC{});
+    }
+    case 2: {
+        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KYXC{});
+    }
+    case 1: {
+        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KXC{});
+    }
+    default: {
+        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
+    }
+    }
+}
+
+HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
+                                                      int num_dim_spatial)
+{
+    namespace tl = ck::tensor_layout::convolution;
+
+    switch(num_dim_spatial)
+    {
+    case 3: {
+        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{});
+    }
+    case 2: {
+        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});
+    }
+    case 1: {
+        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{});
+    }
+    default: {
+        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
+    }
+    }
+}
+
+} // namespace conv
+} // namespace utils
+} // namespace ck
+
+std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParams& p)
+{
+    os << "ConvParams {"
+       << "\nnum_dim_spatial: " << p.num_dim_spatial << "\nN: " << p.N << "\nK: " << p.K
+       << "\nC: " << p.C << "\nfilter_spatial_lengths: " << p.filter_spatial_lengths
+       << "\ninput_spatial_lengths: " << p.input_spatial_lengths
+       << "\nconv_filter_strides: " << p.conv_filter_strides
+       << "\nconv_filter_dilations: " << p.conv_filter_dilations
+       << "\ninput_left_pads: " << p.input_left_pads
+       << "\ninput_right_pads: " << p.input_right_pads;
+    return os;
+}