gaoqiong / composable_kernel / Commits / e1a5137e
"vscode:/vscode.git/clone" did not exist on "58df2883cb3d3813e1d09ba691744773d9dcae58"
Unverified commit e1a5137e, authored Sep 19, 2023 by arai713, committed by GitHub on Sep 19, 2023

    Merge branch 'develop' into transpose_5d

Parents: eb57178d, 718065eb

Showing 20 of 371 changed files, with 1571 additions and 190 deletions (+1571, -190).
- library/src/tensor_operation_instance/gpu/max_pool_bwd/max_pool_bwd_instance_common.hpp (+35, -0)
- library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt (+4, -0)
- library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp (+25, -0)
- library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp (+34, -0)
- library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp (+1, -0)
- library/src/utility/device_memory.cpp (+10, -0)
- profiler/README.md (+38, -0)
- profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp (+253, -0)
- profiler/include/profiler/profile_gemm_multiply_add_impl.hpp (+242, -0)
- profiler/include/profiler/profile_gemm_splitk_impl.hpp (+94, -54)
- profiler/include/profiler/profile_gemm_streamk_impl.hpp (+19, -17)
- profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp (+1, -1)
- profiler/include/profiler/profile_grouped_gemm_impl.hpp (+132, -106)
- profiler/include/profiler/profile_image_to_column_impl.hpp (+200, -0)
- profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp (+288, -0)
- profiler/src/CMakeLists.txt (+8, -0)
- profiler/src/profile_avg_pool3d_bwd.cpp (+175, -0)
- profiler/src/profile_batched_gemm_multi_d.cpp (+2, -2)
- profiler/src/profile_conv_bwd_data.cpp (+4, -4)
- profiler/src/profile_gemm.cpp (+6, -6)
library/src/tensor_operation_instance/gpu/max_pool_bwd/max_pool_bwd_instance_common.hpp (new file, mode 100644)

    // SPDX-License-Identifier: MIT
    // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

    #pragma once

    #include "ck/ck.hpp"
    #include "ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp"
    #include "ck/utility/data_type.hpp"
    #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

    namespace ck {
    namespace tensor_operation {
    namespace device {
    namespace instance {

    using I32  = int32_t;
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
    using F32  = float;

    template <typename DOutDataType, typename IndexDataType, typename DInDataType>
    using device_maxpool_bwd_instances =
        // clang-format off
        std::tuple<
            DeviceMaxPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 1>,
            DeviceMaxPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 2>,
            DeviceMaxPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 4>
        // clang-format on
        >;

    } // namespace instance
    } // namespace device
    } // namespace tensor_operation
    } // namespace ck
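For context, the instance tuple declared in this common header is consumed through `add_device_operation_instances`, in the same way the pool3d instance files below register their instances. A minimal sketch, assuming the header above is included; the function name `add_device_maxpool_bwd_f16_instances` and the `DeviceMaxPoolBwd` base-class signature are assumptions for illustration, not copied from this commit:

```cpp
// Hypothetical registration sketch: how device_maxpool_bwd_instances is typically used.
void add_device_maxpool_bwd_f16_instances(
    std::vector<std::unique_ptr<DeviceMaxPoolBwd<F16, I32, F16>>>& instances)
{
    // Appends one concrete DeviceMaxPoolBwdImpl per vector width (1, 2, 4)
    // declared in device_maxpool_bwd_instances<F16, I32, F16>.
    add_device_operation_instances(instances, device_maxpool_bwd_instances<F16, I32, F16>{});
}
```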
library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt

    @@ -3,6 +3,10 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
        list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
                    device_max_pool3d_fwd_ndhwc_f16_instance.cpp)
    endif()
    if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
        list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp
                    device_max_pool3d_fwd_ndhwc_bf16_instance.cpp)
    endif()
    if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
        list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
                    device_max_pool3d_fwd_ndhwc_f32_instance.cpp)
library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp (new file, mode 100644)

    // SPDX-License-Identifier: MIT
    // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

    #include "pool_fwd_instance_common.hpp"

    namespace ck {
    namespace tensor_operation {
    namespace device {
    namespace instance {

    static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;

    void add_device_pool3d_fwd_ndhwc_bf16_instances(
        std::vector<std::unique_ptr<
            DevicePoolFwd<5, 3, BF16, BF16, I32, NDHWC, NDHWC, ReduceOpId, false>>>& instances)
    {
        add_device_operation_instances(
            instances, device_pool3d_fwd_ndhwc_instances<BF16, BF16, I32, F32, ReduceOpId, false>{});
    }

    } // namespace instance
    } // namespace device
    } // namespace tensor_operation
    } // namespace ck
library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp (new file, mode 100644)

    // SPDX-License-Identifier: MIT
    // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

    #include "pool_fwd_instance_common.hpp"

    namespace ck {
    namespace tensor_operation {
    namespace device {
    namespace instance {

    static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;

    void add_device_pool3d_fwd_ndhwc_bf16_instances(
        std::vector<std::unique_ptr<
            DevicePoolFwd<5, 3, BF16, BF16, I32, NDHWC, NDHWC, ReduceOpId, false>>>& instances)
    {
        add_device_operation_instances(
            instances, device_pool3d_fwd_ndhwc_instances<BF16, BF16, I32, BF16, ReduceOpId, false>{});
    }

    void add_device_pool3d_fwd_ndhwc_index_bf16_instances(
        std::vector<std::unique_ptr<
            DevicePoolFwd<5, 3, BF16, BF16, I32, NDHWC, NDHWC, ReduceOpId, true>>>& instances)
    {
        add_device_operation_instances(
            instances, device_pool3d_fwd_ndhwc_instances<BF16, BF16, I32, BF16, ReduceOpId, true>{});
    }

    } // namespace instance
    } // namespace device
    } // namespace tensor_operation
    } // namespace ck
library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp

    @@ -17,6 +17,7 @@ namespace instance {
    using I32  = int32_t;
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
    using F32  = float;

    using NDHWC = ck::tensor_layout::convolution::NDHWC;
library/src/utility/device_memory.cpp

    @@ -37,6 +37,11 @@ void DeviceMem::ToDevice(const void* p) const
        }
    }

    void DeviceMem::ToDevice(const void* p, const std::size_t cpySize) const
    {
        hip_check_error(
            hipMemcpy(mpDeviceBuf, const_cast<void*>(p), cpySize, hipMemcpyHostToDevice));
    }

    void DeviceMem::FromDevice(void* p) const
    {
        if(mpDeviceBuf)

    @@ -49,6 +54,11 @@ void DeviceMem::FromDevice(void* p) const
        }
    }

    void DeviceMem::FromDevice(void* p, const std::size_t cpySize) const
    {
        hip_check_error(hipMemcpy(p, mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
    }

    void DeviceMem::SetZero() const
    {
        if(mpDeviceBuf)
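The two new overloads copy only the first `cpySize` bytes instead of the whole allocation. A minimal usage sketch (host-side only; the buffer size and copy size are made-up values for illustration):

```cpp
#include <vector>
#include "ck/library/utility/device_memory.hpp"

int main()
{
    std::vector<float> host(1024, 1.0f);

    // Allocate the full buffer, but move only the first 256 floats each way.
    DeviceMem buf(sizeof(float) * host.size());
    buf.ToDevice(host.data(), sizeof(float) * 256);   // partial host -> device copy
    buf.FromDevice(host.data(), sizeof(float) * 256); // partial device -> host copy
    return 0;
}
```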
profiler/README.md

@@ -184,3 +184,41 @@ tflops: 95.337
GB/s: 69.2301
```
Note: This kernel uses atomic add, which causes the output buffer to be accumulated multiple times and makes verification fail. To work around it, do not use CK's own timer and verification at the same time.

## Profile image to column kernels
```bash
# arg1: tensor operation ("OP_NAME": "OP_DESC")
# arg2: data type (0: Input fp32, Weight fp32, Output fp32
#                  1: Input fp16, Weight fp16, Output fp16
#                  2: Input bf16, Weight bf16, Output bf16
#                  3: Input int8, Weight int8, Output int8)
# arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])
# arg4: verification (0: no, 1: yes)
# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
# arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes)
# Following arguments (depending on number of spatial dims):
#  Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
#  G, N, K, C,
#  <filter spatial dimensions>, (ie Y, X for 2D)
#  <input image spatial dimensions>, (ie Hi, Wi for 2D)
#  <strides>, (ie Sy, Sx for 2D)
#  <dilations>, (ie Dy, Dx for 2D)
#  <left padding>, (ie LeftPy, LeftPx for 2D)
#  <right padding>, (ie RightPy, RightPx for 2D)
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
./bin/ckProfiler image_to_column 0 0 1 1 0 1 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0
```
Result (MI210, FP32, NHWC)
```
input: dim 5, lengths {1, 256, 512, 28, 28}, strides {102760448, 401408, 1, 14336, 512}
output: dim 2, lengths {173056, 4608}, strides {4608, 1}
....
Best configuration parameters:
name: DeviceImageToColumn<128, 32, 64, 4>
avg_time: 3.12326
GB/s: 2042.59
```
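The reported bandwidth is simply bytes moved divided by the averaged kernel time; with the output shape printed above and one fp32 read plus one fp32 write per output element (which is how the image-to-column profiler below counts bytes), the arithmetic lands on roughly the 2042 GB/s shown. A small sketch of that calculation:

```cpp
#include <cstdio>

int main()
{
    const double out_rows = 173056.0; // N * Ho * Wo
    const double out_cols = 4608.0;   // C * Y * X
    const double bytes    = out_rows * out_cols * (4.0 + 4.0); // fp32 read + fp32 write per element
    const double avg_ms   = 3.12326;

    std::printf("GB/s: %.2f\n", bytes / 1.0e6 / avg_ms); // ~2042
    return 0;
}
```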
profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp (new file, mode 100644)

    // SPDX-License-Identifier: MIT
    // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

    #pragma once

    #include <iomanip>

    #include "ck/ck.hpp"
    #include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
    #include "ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp"
    #include "ck/library/utility/check_err.hpp"
    #include "ck/library/utility/device_memory.hpp"
    #include "ck/library/utility/host_tensor.hpp"
    #include "ck/library/utility/host_tensor_generator.hpp"
    #include "ck/library/utility/literals.hpp"
    #include "ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp"

    namespace ck {
    namespace profiler {

    template <typename TensorLayout>
    std::vector<ck::index_t> f_tensor_strides_ncdhw(
        ck::index_t N_, ck::index_t C_, ck::index_t D, ck::index_t H, ck::index_t W, TensorLayout layout)
    {
        using namespace ck::literals;
        (void)N_;

        if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
            return {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_};
        else
            throw std::runtime_error("not supported yet");
    }

    template <typename DOutDataType,
              typename DInDataType,
              typename ComputeDataType,
              typename DOutLayout,
              typename DInLayout>
    bool profile_avg_pool3d_bwd_impl(int do_verification,
                                     int init_method,
                                     bool do_log,
                                     bool time_kernel,
                                     std::vector<index_t> in_length, // NCDHW
                                     std::vector<index_t> window_spatial_lengths,
                                     std::vector<index_t> window_strides,
                                     std::vector<index_t> window_dilations,
                                     std::vector<index_t> input_left_pads,
                                     std::vector<index_t> input_right_pads)
    {
        constexpr index_t InOutRank  = 5;
        constexpr index_t WindowRank = 3;

        if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
           window_strides.size() != WindowRank || window_dilations.size() != WindowRank ||
           input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank)
        {
            std::cout << "Parameter is incorrect" << std::endl;
            return false;
        }

        std::vector<index_t> out_length(InOutRank);

        int N = in_length[0];
        int C = in_length[1];

        out_length[0] = N;
        out_length[1] = C;

        // Calculate Do, Ho, Wo
        for(int i = 2; i < InOutRank; ++i)
        {
            auto pad1             = input_left_pads[i - 2];
            auto pad2             = input_right_pads[i - 2];
            auto windows_size     = window_spatial_lengths[i - 2];
            auto windows_stride   = window_strides[i - 2];
            auto windows_dilation = window_dilations[i - 2];

            auto eff = (windows_size - 1) * windows_dilation + 1;

            out_length[i] = (in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
        }

        int Di = in_length[2];
        int Hi = in_length[3];
        int Wi = in_length[4];
        int Do = out_length[2];
        int Ho = out_length[3];
        int Wo = out_length[4];

        auto f_host_tensor_descriptor =
            [](std::size_t N_, std::size_t C_, std::size_t D, std::size_t H, std::size_t W) {
                using namespace ck::literals;
                return HostTensorDescriptor({N_, C_, D, H, W},
                                            {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
            };

        Tensor<DOutDataType> dout_n_c_do_ho_wo(f_host_tensor_descriptor(N, C, Do, Ho, Wo));
        Tensor<DInDataType> din_n_c_di_hi_wi_device(f_host_tensor_descriptor(N, C, Di, Hi, Wi));
        Tensor<DInDataType> din_n_c_di_hi_wi_host(f_host_tensor_descriptor(N, C, Di, Hi, Wi));

        switch(init_method)
        {
        case 0: dout_n_c_do_ho_wo.GenerateTensorValue(GeneratorTensor_1<DOutDataType>{}); break;
        case 1: dout_n_c_do_ho_wo.GenerateTensorValue(GeneratorTensor_2<DOutDataType>{-5, 5}); break;
        default: dout_n_c_do_ho_wo.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{-0.5, 0.5});
        }

        DeviceMem dout_device_buf(sizeof(DOutDataType) * dout_n_c_do_ho_wo.mDesc.GetElementSpaceSize());
        DeviceMem din_device_buf(sizeof(DInDataType) * din_n_c_di_hi_wi_device.mDesc.GetElementSpaceSize());

        dout_device_buf.ToDevice(dout_n_c_do_ho_wo.mData.data());

        using DeviceOp = ck::tensor_operation::device::
            DeviceAvgPoolBwd<3, DOutDataType, DInDataType, DOutLayout, DInLayout>;

        // get device op instances
        const auto instance_ptrs = ck::tensor_operation::device::instance::
            DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

        std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

        std::string best_instance_name;
        float best_avg_time   = std::numeric_limits<float>::max();
        float best_gb_per_sec = 0;

        if(do_verification)
        {
            using ReferencePoolingBwdInstance =
                ck::tensor_operation::host::ReferenceAvgPoolBwd<3, DInDataType, DOutDataType>;

            ReferencePoolingBwdInstance ref_pooling_bwd;
            auto ref_pooling_bwd_argument = ref_pooling_bwd.MakeArgument(din_n_c_di_hi_wi_host,
                                                                         dout_n_c_do_ho_wo,
                                                                         window_spatial_lengths,
                                                                         window_strides,
                                                                         window_dilations,
                                                                         input_left_pads,
                                                                         input_right_pads);
            auto ref_invoker = ref_pooling_bwd.MakeInvoker();
            ref_invoker.Run(ref_pooling_bwd_argument);
        }

        int num_kernel = 0;

        for(auto& inst_ptr : instance_ptrs)
        {
            auto argument_ptr = inst_ptr->MakeArgumentPointer(
                static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
                static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
                {N, C, Do, Ho, Wo},
                {N, C, Di, Hi, Wi},
                f_tensor_strides_ncdhw(N, C, Do, Ho, Wo, DOutLayout{}),
                f_tensor_strides_ncdhw(N, C, Di, Hi, Wi, DInLayout{}),
                window_spatial_lengths,
                window_strides,
                window_dilations,
                input_left_pads,
                input_right_pads);

            if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
            {
                ++num_kernel;
            }
            else
            {
                if(time_kernel)
                {
                    std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
                    LogRange(std::cout << "doutput lengths = ", out_length, ", ") << std::endl;
                }
                continue;
            }

            din_device_buf.SetZero();

            auto invoker_ptr = inst_ptr->MakeInvokerPointer();

            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

            std::size_t num_bytes =
                dout_n_c_do_ho_wo.mDesc.GetElementSize() * sizeof(DOutDataType) +
                din_n_c_di_hi_wi_device.mDesc.GetElementSize() * sizeof(DInDataType);

            float gb_per_sec = num_bytes / 1.E6 / avg_time;

            if(time_kernel)
                std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec
                          << " GB/s, " << inst_ptr->GetTypeString() << std::endl;

            if(avg_time < best_avg_time)
            {
                best_instance_name = inst_ptr->GetTypeString();
                best_avg_time      = avg_time;
                best_gb_per_sec    = gb_per_sec;
            }

            if(do_verification)
            {
                din_device_buf.FromDevice(din_n_c_di_hi_wi_device.mData.data());

                bool pass = ck::utils::check_err(din_n_c_di_hi_wi_device.mData,
                                                 din_n_c_di_hi_wi_host.mData,
                                                 "Error: Incorrect results",
                                                 1e-3,
                                                 1e-3);

                if(do_log)
                {
                    LogRangeAsType<float>(std::cout << "din_n_c_di_hi_wi_device: ", din_n_c_di_hi_wi_device.mData, ",") << std::endl;
                    LogRangeAsType<float>(std::cout << "din_n_c_di_hi_wi_host: ", din_n_c_di_hi_wi_host.mData, ",") << std::endl;
                }

                if(!pass)
                {
                    std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                    LogRange(std::cout << "doutput lengths = [", out_length, ", ") << "]." << std::endl;
                    return false;
                }
                else
                {
                    if(time_kernel)
                        std::cout << "pass" << std::endl;
                }
            }
        }

        if(time_kernel)
        {
            LogRange(std::cout << "length = ", out_length, ",") << std::endl;
            std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
                      << best_instance_name << std::endl;
        }

        if(num_kernel == 0)
        {
            std::cout << "Error: No kernel is applicable" << std::endl;
            return false;
        }

        return true;
    }

    } // namespace profiler
    } // namespace ck
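A sketch of how this header might be driven from a profiler entry point; the data types, layouts, and shapes below are illustrative assumptions (the real driver lives in profiler/src/profile_avg_pool3d_bwd.cpp, which is part of this commit but not shown on this page):

```cpp
// Hypothetical driver for profile_avg_pool3d_bwd_impl; shapes are made up.
#include "profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp"

int main()
{
    using F32   = float;
    using NDHWC = ck::tensor_layout::convolution::NDHWC;

    // Template args: DOutDataType, DInDataType, ComputeDataType, DOutLayout, DInLayout.
    bool ok = ck::profiler::profile_avg_pool3d_bwd_impl<F32, F32, F32, NDHWC, NDHWC>(
        /*do_verification=*/1,
        /*init_method=*/2,
        /*do_log=*/false,
        /*time_kernel=*/true,
        {2, 32, 30, 30, 30}, // in_length: N, C, Di, Hi, Wi
        {2, 2, 2},           // window lengths (D, H, W)
        {2, 2, 2},           // window strides
        {1, 1, 1},           // window dilations
        {1, 1, 1},           // left pads
        {1, 1, 1});          // right pads
    return ok ? 0 : 1;
}
```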
profiler/include/profiler/profile_gemm_multiply_add_impl.hpp (new file, mode 100644)

    // SPDX-License-Identifier: MIT
    // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

    #pragma once

    #include <iomanip>

    #include "ck/ck.hpp"
    #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
    #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
    #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
    #include "ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp"
    #include "ck/library/utility/check_err.hpp"
    #include "ck/library/utility/device_memory.hpp"
    #include "ck/library/utility/host_tensor.hpp"
    #include "ck/library/utility/host_tensor_generator.hpp"
    #include "ck/library/utility/literals.hpp"
    #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

    namespace ck {
    namespace profiler {

    template <typename ADataType,
              typename BDataType,
              typename AccDataType,
              typename D0DataType,
              typename D1DataType,
              typename EDataType,
              typename ALayout,
              typename BLayout,
              typename D0Layout,
              typename D1Layout,
              typename ELayout>
    bool profile_gemm_multiply_add_impl(int do_verification,
                                        int init_method,
                                        bool /*do_log*/,
                                        bool time_kernel,
                                        int M,
                                        int N,
                                        int K,
                                        int StrideA,
                                        int StrideB,
                                        int StrideD0,
                                        int StrideD1,
                                        int StrideE)
    {
        auto f_host_tensor_descriptor =
            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
                using namespace ck::literals;

                if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
                {
                    return HostTensorDescriptor({row, col}, {stride, 1_uz});
                }
                else
                {
                    return HostTensorDescriptor({row, col}, {1_uz, stride});
                }
            };

        Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
        Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
        Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
        Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
        Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
        Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));

        std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
        std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
        std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
        std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
        std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;

        switch(init_method)
        {
        case 0: break;
        case 1:
            a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
            b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
            d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
            d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-1, 1});
            break;
        default:
            a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 0.2});
            b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.1, 0.1});
            d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
            d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
        }

        using PassThrough = ck::tensor_operation::element_wise::PassThrough;
        using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd;

        using AElementOp   = PassThrough;
        using BElementOp   = PassThrough;
        using CDEElementOp = MultiplyAdd;

        const auto a_element_op   = AElementOp{};
        const auto b_element_op   = BElementOp{};
        const auto cde_element_op = CDEElementOp{};

        using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
                                                                           BLayout,
                                                                           ck::Tuple<D0Layout, D1Layout>,
                                                                           ELayout,
                                                                           ADataType,
                                                                           BDataType,
                                                                           ck::Tuple<D0DataType, D1DataType>,
                                                                           EDataType,
                                                                           PassThrough,
                                                                           PassThrough,
                                                                           CDEElementOp>;

        // get device op instances
        const auto op_ptrs = ck::tensor_operation::device::instance::
            DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

        std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

        // run reference
        if(do_verification)
        {
            Tensor<AccDataType> c_m_n({M, N});

            using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                    BDataType,
                                                                                    AccDataType,
                                                                                    AccDataType,
                                                                                    AElementOp,
                                                                                    BElementOp,
                                                                                    PassThrough>;

            auto ref_gemm    = ReferenceGemmInstance{};
            auto ref_invoker = ref_gemm.MakeInvoker();

            auto ref_argument =
                ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});

            ref_invoker.Run(ref_argument);

            for(int m = 0; m < M; ++m)
            {
                for(int n = 0; n < N; ++n)
                {
                    cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
                }
            }
        }

        DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
        DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
        DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
        DeviceMem d1_m_n_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
        DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());

        a_device_buf.ToDevice(a_m_k.mData.data());
        b_device_buf.ToDevice(b_k_n.mData.data());
        d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
        d1_m_n_device_buf.ToDevice(d1_m_n.mData.data());

        std::string best_op_name;
        float best_ave_time   = 0;
        float best_tflops     = 0;
        float best_gb_per_sec = 0;

        bool pass = true;

        // profile device operation instances
        for(auto& op_ptr : op_ptrs)
        {
            auto argument_ptr = op_ptr->MakeArgumentPointer(
                a_device_buf.GetDeviceBuffer(),
                b_device_buf.GetDeviceBuffer(),
                std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
                                           d1_m_n_device_buf.GetDeviceBuffer()},
                e_device_buf.GetDeviceBuffer(),
                M,
                N,
                K,
                StrideA,
                StrideB,
                std::array<ck::index_t, 2>{StrideD0, StrideD1},
                StrideE,
                a_element_op,
                b_element_op,
                cde_element_op);

            auto invoker_ptr = op_ptr->MakeInvokerPointer();

            std::string op_name = op_ptr->GetTypeString();

            if(op_ptr->IsSupportedArgument(argument_ptr.get()))
            {
                // re-init E to zero before profiling a kernel
                e_device_buf.SetZero();

                float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

                std::size_t flop = std::size_t(2) * M * N * K;

                std::size_t num_btype =
                    sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;

                float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

                float gb_per_sec = num_btype / 1.E6 / ave_time;

                std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                          << gb_per_sec << " GB/s, " << op_name << std::endl;

                if(tflops > best_tflops)
                {
                    best_op_name    = op_name;
                    best_tflops     = tflops;
                    best_ave_time   = ave_time;
                    best_gb_per_sec = gb_per_sec;
                }

                if(do_verification)
                {
                    e_device_buf.FromDevice(e_m_n_device_result.mData.data());

                    pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
                }
            }
            else
            {
                std::cout << op_name << " does not support this problem" << std::endl;
            }
        }

        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
                  << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

        return pass;
    }

    } // namespace profiler
    } // namespace ck
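The `MultiplyAdd` epilogue used above combines the GEMM accumulator with the two auxiliary tensors elementwise, which is exactly what the host verification loop applies. A minimal sketch of that math with the functor stubbed out locally (a local stand-in, not CK's `element_wise::MultiplyAdd` itself):

```cpp
#include <cstdio>

// Local stand-in for the multiply-add epilogue: e = (acc * d0) + d1 per (m, n) element.
struct MultiplyAddSketch
{
    void operator()(float& e, float acc, float d0, float d1) const { e = acc * d0 + d1; }
};

int main()
{
    MultiplyAddSketch op;
    float e = 0.0f;
    op(e, /*acc=*/2.0f, /*d0=*/3.0f, /*d1=*/1.0f);
    std::printf("e = %.1f\n", e); // 7.0
    return 0;
}
```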
profiler/include/profiler/profile_gemm_splitk_impl.hpp
(The inline diff below interleaves removed and added lines of each hunk in display order.)

    @@ -94,7 +94,6 @@ bool profile_gemm_splitk_impl(int do_verification,
        a_device_buf.ToDevice(a_m_k.mData.data());
        b_device_buf.ToDevice(b_k_n.mData.data());
        c_device_buf.SetZero();

        using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<ALayout,
                                                                        BLayout,

    @@ -136,77 +135,118 @@ bool profile_gemm_splitk_impl(int do_verification,
        float best_ave_time   = 0;
        float best_tflops     = 0;
        float best_gb_per_sec = 0;
        float best_kbatch     = 0;

        // profile device GEMM instances
        for(auto& op_ptr : op_ptrs)
        {
            auto argument_ptr = op_ptr->MakeArgumentPointer(
                static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
                static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
                static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
                M, N, K, StrideA, StrideB, StrideC, a_element_op, b_element_op, c_element_op, KBatch);
            auto invoker_ptr = op_ptr->MakeInvokerPointer();
            if(op_ptr->IsSupportedArgument(argument_ptr.get()))

            std::vector<int> kbatch_list = {1,  2,  4,  8,  12, 16,  20,  24,  32,  36,  40, 60,
                                            64, 72, 80, 88, 96, 128, 144, 160, 176, 192, 256};

            if(KBatch > 0)
            {
                // re-init C to zero before profiling next kernel
                c_device_buf.SetZero();
                kbatch_list = {KBatch};
            }

            for(std::size_t i = 0; i < kbatch_list.size(); i++)
            {
                auto kbatch_curr = kbatch_list[i];

                auto argument_ptr = op_ptr->MakeArgumentPointer(
                    static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
                    static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
                    static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
                    M, N, K, StrideA, StrideB, StrideC, a_element_op, b_element_op, c_element_op,
                    kbatch_curr);

                auto invoker_ptr = op_ptr->MakeInvokerPointer();

                if(op_ptr->IsSupportedArgument(argument_ptr.get()))
                {
                    std::string op_name = op_ptr->GetTypeString();

                    // re-init C to zero before profiling next kernel
                    c_device_buf.SetZero();

                    float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
                    invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});

                    std::size_t flop = std::size_t(2) * M * N * K;

                    if(do_verification)
                    {
                        c_device_buf.FromDevice(c_m_n_device_result.mData.data());
                        pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);

                        if(do_log)
                        {
                            LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
                            LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
                            LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result.mData, ",") << std::endl;
                            LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl;
                        }
                    }

                    std::size_t num_btype =
                        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;

                    std::string op_name = op_ptr->GetTypeString();
                    float tflops   = static_cast<float>(flop) / 1.E9 / ave_time;
                    float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
                    float gb_per_sec = num_btype / 1.E6 / ave_time;

                    std::size_t flop = std::size_t(2) * M * N * K;
                    std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                              << gb_per_sec << " GB/s, " << op_name << std::endl;
                    std::size_t num_btype =
                        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;

                    if(tflops > best_tflops)
                    {
                        best_op_name    = op_name;
                        best_tflops     = tflops;
                        best_ave_time   = ave_time;
                        best_gb_per_sec = gb_per_sec;
                    }

                    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

                    if(do_verification)
                    {
                        c_device_buf.FromDevice(c_m_n_device_result.mData.data());
                        float gb_per_sec = num_btype / 1.E6 / ave_time;
                        pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
                        std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                                  << gb_per_sec << " GB/s, " << op_name << ", KBatch " << kbatch_curr << std::endl;

                        if(do_log)
    #if defined CK_ENABLE_FP8
                        // set softer tolerances for fp8
                        if constexpr(is_same_v<ADataType, f8_t> || is_same_v<BDataType, f8_t> ||
                                     is_same_v<CDataType, f8_t>)
                        {
                            LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
                            LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
                            LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result.mData, ",") << std::endl;
                            LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl;
                            std::string msg = "Error: Incorrect results!";
                            double rtol     = 1e-1;
                            double atol     = 1e-1;
                            pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result, msg, rtol, atol);
                        }
                        else
                        {
    #endif
                            pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
    #if defined CK_ENABLE_FP8
                        }
    #endif

                        if(tflops > best_tflops)
                        {
                            best_op_name    = op_name;
                            best_tflops     = tflops;
                            best_ave_time   = ave_time;
                            best_gb_per_sec = gb_per_sec;
                            best_kbatch     = kbatch_curr;
                        }
                    }
                }
                else
                {
                    std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
                    else
                    {
                        std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
                    }
                }
            }
        }

    @@ -246,7 +286,7 @@ bool profile_gemm_splitk_impl(int do_verification,
        }

        std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
                  << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << KBatch
                  << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch
                  << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
                  << " GB/s, " << best_op_name << std::endl;
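The kbatch sweep introduced above exists because split-K partitions the K dimension across `kbatch` work-groups and then combines the partial products, so the best value depends on the problem size and the GPU. A small host-side sketch of the decomposition being profiled (plain C++, not CK's kernel):

```cpp
#include <cstdio>
#include <vector>

// Split-K on the host: each "batch" computes a partial C over its K-slice and the
// partials are accumulated, mimicking what the device kernels do with atomics or
// a final reduction pass.
int main()
{
    const int M = 2, N = 2, K = 8, kbatch = 4;
    std::vector<float> A(M * K, 1.0f), B(K * N, 2.0f), C(M * N, 0.0f);

    const int k_per_batch = K / kbatch;
    for(int kb = 0; kb < kbatch; ++kb)
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
                for(int k = kb * k_per_batch; k < (kb + 1) * k_per_batch; ++k)
                    C[m * N + n] += A[m * K + k] * B[k * N + n]; // accumulate this batch's partial sum

    std::printf("C[0] = %.1f\n", C[0]); // 16.0 for these all-ones / all-twos inputs
    return 0;
}
```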
profiler/include/profiler/profile_gemm_streamk_impl.hpp
(The first hunk adds an early verification pass after an untimed run; the second hunk removes the old verification block that ran after timing.)

    @@ -170,6 +170,25 @@ bool profile_gemm_streamk_impl(int do_verification,
                // re-init C to zero before profiling next kernel
                c_device_buf.SetZero();

                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});

                if(do_verification)
                {
                    c_device_buf.FromDevice(c_m_n_device_result.mData.data());

                    pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);

                    if(do_log)
                    {
                        LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
                        LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
                        LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result.mData, ",") << std::endl;
                        LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl;
                    }
                }

                std::string op_name = op_ptr->GetTypeString();

                float ave_time =

    @@ -194,23 +213,6 @@ bool profile_gemm_streamk_impl(int do_verification,
                    best_ave_time   = ave_time;
                    best_gb_per_sec = gb_per_sec;
                }

                if(do_verification)
                {
                    c_device_buf.FromDevice(c_m_n_device_result.mData.data());

                    pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);

                    if(do_log)
                    {
                        LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
                        LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
                        LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result.mData, ",") << std::endl;
                        LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl;
                    }
                }
            }
            else
            {
profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
(One line changed: the "found N instances" message now says "ckProfiler" instead of "xdl".)

    @@ -215,7 +215,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
        const auto op_ptrs = ck::tensor_operation::device::instance::
            DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

        std::cout << "xdl found " << op_ptrs.size() << " instances" << std::endl;
        std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl;

        for(auto& op_ptr : op_ptrs)
        {
profiler/include/profiler/profile_grouped_gemm_impl.hpp
(The inline diff below interleaves removed and added lines of each hunk in display order.)

    @@ -70,6 +70,7 @@ bool profile_grouped_gemm_impl(int do_verification,
        std::vector<Tensor<ADataType>> a_m_k;
        std::vector<Tensor<BDataType>> b_k_n;
        std::vector<Tensor<CDataType>> c_m_n_host_results;
        std::vector<Tensor<CDataType>> c_m_n_device_results;

        for(std::size_t i = 0; i < group_count; i++)

    @@ -81,6 +82,9 @@ bool profile_grouped_gemm_impl(int do_verification,
            c_m_n_device_results.push_back(
                Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
            c_m_n_host_results.push_back(
                Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));

    #if DEBUG_LOG
            std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
                      << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i

    @@ -137,7 +141,6 @@ bool profile_grouped_gemm_impl(int do_verification,
            a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
            b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
            c_device_buf[i]->SetZero();

            gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});

    @@ -170,9 +173,36 @@ bool profile_grouped_gemm_impl(int do_verification,
        float best_ave_time   = 0;
        float best_tflops     = 0;
        float best_gb_per_sec = 0;
        float best_kbatch     = 0;

        auto p_ds = std::vector<std::array<const void*, 0>>{};

        if(do_verification)
        {
            for(std::size_t i = 0; i < gemm_descs.size(); i++)
            {
                using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<
                    ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

                auto ref_gemm    = ReferenceGemmInstance{};
                auto ref_invoker = ref_gemm.MakeInvoker();

                auto ref_argument = ref_gemm.MakeArgument(
                    a_m_k[i], b_k_n[i], c_m_n_host_results[i], a_element_op, b_element_op, c_element_op);

                ref_invoker.Run(ref_argument);
            }
        }

        // profile device GEMM instances
        for(auto& gemm_ptr : op_ptrs)
        {

    @@ -193,139 +223,135 @@ bool profile_grouped_gemm_impl(int do_verification,
            gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());

            std::string gemm_name = gemm_ptr->GetTypeString();

            if(kbatch > 1)
            using DeviceOpSplitK = ck::tensor_operation::device::DeviceGroupedGemmSplitK<
                ALayout, BLayout, ck::Tuple<>, CLayout, ADataType, BDataType, ck::Tuple<>, CDataType,
                AElementOp, BElementOp, CElementOp>;

            // skip non-splitk grouped_gemm
            if(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) == nullptr)
            {
                using DeviceOpSplitK = ck::tensor_operation::device::DeviceGroupedGemmSplitK<
                    ALayout, BLayout, ck::Tuple<>, CLayout, ADataType, BDataType, ck::Tuple<>, CDataType,
                    AElementOp, BElementOp, CElementOp>;

                if(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) != nullptr)
                {
                    dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())->SetKBatchSize(argument_ptr.get(), kbatch);
                }
                continue;
            }

            if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))

            std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64};

            if(kbatch > 0)
            {
                kbatch_list = {kbatch};
            }

            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

            for(std::size_t j = 0; j < kbatch_list.size(); j++)
            {
                auto kbatch_curr = kbatch_list[j];

                dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())->SetKBatchSize(argument_ptr.get(), kbatch_curr);

                if(time_kernel)
                if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
                {
                    std::size_t flop = 0, num_btype = 0;

                    for(std::size_t i = 0; i < gemm_descs.size(); i++)
                    {
                        flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
                        c_device_buf[i]->SetZero();
                        num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + sizeof(BDataType) * Ks[i] * Ns[i] +
                                     sizeof(CDataType) * Ms[i] * Ns[i];
                    }

                    invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});

                    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

                    if(do_verification)
                    {
                        bool instance_pass = true;

                        for(std::size_t i = 0; i < gemm_descs.size(); i++)
                        {
                            float gb_per_sec = num_btype / 1.E6 / ave_time;

                            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
                                      << " TFlops, " << gb_per_sec << " GB/s, " << gemm_name << std::endl;

                            c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());

                            if(std::is_same_v<CDataType, ck::half_t> && kbatch_curr > 1)
                            {
                                instance_pass = instance_pass &&
                                    ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_results[i],
                                                         "Error: Incorrect results!", 0.06);
                            }
                            else
                            {
                                instance_pass = instance_pass &&
                                    ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_results[i]);
                            }

                            if(do_log)
                            {
                                LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",") << std::endl;
                                LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",") << std::endl;
                                LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_results[i].mData, ",") << std::endl;
                                LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_results[i].mData, ",") << std::endl;
                            }
                        }

                        if(tflops > best_tflops)
                        {
                            best_gemm_name  = gemm_name;
                            best_tflops     = tflops;
                            best_ave_time   = ave_time;
                            best_gb_per_sec = gb_per_sec;
                        }
                    }

                    std::cout << "Instance: " << gemm_name << " verification "
                              << (instance_pass ? "SUCCEED" : "FAILED") << std::endl;

                    if(do_verification)
                    {
                        bool instance_pass = true;

                        for(std::size_t i = 0; i < gemm_descs.size(); i++)
                        {
                            pass = pass && instance_pass;
                        }

                        c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
                        c_device_buf[i]->SetZero();

                        float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

                        Tensor<CDataType> c_m_n_host_result(
                            f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}));

                        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<
                            ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

                        auto ref_gemm    = ReferenceGemmInstance{};
                        auto ref_invoker = ref_gemm.MakeInvoker();

                        auto ref_argument = ref_gemm.MakeArgument(
                            a_m_k[i], b_k_n[i], c_m_n_host_result, a_element_op, b_element_op, c_element_op);

                        ref_invoker.Run(ref_argument);

                        if(std::is_same_v<CDataType, ck::half_t> && kbatch > 1)
                        {
                            instance_pass = instance_pass &&
                                ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result,
                                                     "Error: Incorrect results!", 0.06);
                        }
                        else
                        if(time_kernel)
                        {
                            std::size_t flop = 0, num_btype = 0;

                            for(std::size_t i = 0; i < gemm_descs.size(); i++)
                            {
                                instance_pass = instance_pass &&
                                    ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
                                flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
                                num_btype += sizeof(ADataType) * Ms[i] * Ks[i] +
                                             sizeof(BDataType) * Ks[i] * Ns[i] +
                                             sizeof(CDataType) * Ms[i] * Ns[i];
                            }

                            if(do_log)
                            float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
                            float gb_per_sec = num_btype / 1.E6 / ave_time;

                            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
                                      << " TFlops, " << gb_per_sec << " GB/s, " << gemm_name
                                      << ", KBatch " << kbatch_curr << std::endl;

                            if(tflops > best_tflops)
                            {
                                LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",") << std::endl;
                                LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",") << std::endl;
                                LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_results[i].mData, ",") << std::endl;
                                LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result.mData, ",") << std::endl;
                                best_gemm_name  = gemm_name;
                                best_tflops     = tflops;
                                best_ave_time   = ave_time;
                                best_gb_per_sec = gb_per_sec;
                                best_kbatch     = kbatch_curr;
                            }
                        }

                        std::cout << "Instance: " << gemm_name << " verification "
                                  << (instance_pass ? "SUCCEED" : "FAILED") << std::endl;

                        pass = pass && instance_pass;
                    }
                }
                else
                {
                    std::cout << "Instance: " << gemm_name << ", does not support this GEMM problem" << std::endl;
                    else
                    {
                        std::cout << "Instance: " << gemm_name << ", does not support this GEMM problem" << std::endl;
                    }
                }
            }
        }

        if(time_kernel)
        {
            std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
                      << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
                      << best_gb_per_sec << " GB/s, " << best_gemm_name << ", KBatch = " << best_kbatch
                      << std::endl;
        }

        return pass;
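One detail worth noting in the hunk above: when the output type is fp16 and `kbatch > 1`, verification uses a relaxed tolerance (0.06), because splitting K changes the order in which partial sums are rounded. A tiny sketch of why accumulation order matters in limited precision (shown here with single-precision floats rather than fp16, but the effect is the same):

```cpp
#include <cstdio>

// Adding a tiny value directly to a large accumulator rounds it away, while
// grouping the tiny values first (as a split-K style partial sum would) keeps it.
int main()
{
    const float big = 1.0e7f, tiny = 0.25f;

    float serial = big;
    for(int i = 0; i < 4; ++i)
        serial += tiny; // each 0.25 is lost against 1e7 (ulp at 1e7 is 1.0)

    float partial = 0.0f;
    for(int i = 0; i < 4; ++i)
        partial += tiny; // partial sum of the small terms first
    const float split = big + partial; // combined once at the end

    std::printf("serial = %.1f, split = %.1f\n", serial, split); // 10000000.0 vs 10000001.0
    return 0;
}
```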
profiler/include/profiler/profile_image_to_column_impl.hpp (new file, mode 100644)

    // SPDX-License-Identifier: MIT
    // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

    #pragma once

    #include <iomanip>
    #include <iostream>
    #include <typeinfo>
    #include <limits>

    #include "ck/ck.hpp"
    #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
    #include "ck/tensor_operation/gpu/device/device_image_to_column.hpp"
    #include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
    #include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp"
    #include "ck/library/utility/check_err.hpp"
    #include "ck/library/utility/device_memory.hpp"
    #include "ck/library/utility/host_tensor.hpp"
    #include "ck/library/utility/host_tensor_generator.hpp"
    #include "ck/library/utility/convolution_parameter.hpp"
    #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
    #include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"

    namespace ck {
    namespace profiler {

    template <ck::index_t... Is>
    using S = ck::Sequence<Is...>;

    template <index_t NDimSpatial,
              typename InputLayout,
              typename InputDataType,
              typename OutputDataType>
    bool profile_image_to_column_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
                                      bool time_kernel,
                                      const ck::utils::conv::ConvParam& conv_param)
    {
        const ck::index_t NDoHoWo =
            conv_param.N_ *
            ck::accumulate_n<ck::index_t>(
                conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
        const ck::index_t CZYX =
            conv_param.C_ *
            ck::accumulate_n<ck::index_t>(
                conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());

        const auto in_desc =
            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InputLayout>(conv_param);
        const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});

        std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
        std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
        std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
        std::array<ck::index_t, 2> output_m_k_strides{};
        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
        std::array<ck::index_t, NDimSpatial> input_left_pads{};
        std::array<ck::index_t, NDimSpatial> input_right_pads{};

        auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };

        copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
        copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
        copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
        copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
        copy(out_desc.GetStrides(), output_m_k_strides);
        copy(conv_param.conv_filter_strides_, conv_filter_strides);
        copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
        copy(conv_param.input_left_pads_, input_left_pads);
        copy(conv_param.input_right_pads_, input_right_pads);

        Tensor<InputDataType> input(in_desc);
        Tensor<OutputDataType> host_output(out_desc);
        Tensor<OutputDataType> device_output(out_desc);

        std::cout << "input: " << input.mDesc << std::endl;
        std::cout << "output: " << host_output.mDesc << std::endl;

        switch(init_method)
        {
        case 0: break;
        case 1: input.GenerateTensorValue(GeneratorTensor_2<InputDataType>{-5, 5}); break;
        default: input.GenerateTensorValue(GeneratorTensor_3<InputDataType>{0.0, 1.0});
        }

        DeviceMem in_device_buf(sizeof(InputDataType) * input.mDesc.GetElementSpaceSize());
        DeviceMem out_device_buf(sizeof(OutputDataType) * device_output.mDesc.GetElementSpaceSize());

        in_device_buf.ToDevice(input.mData.data());

        // run reference op
        if(do_verification)
        {
            auto ref_image_to_column = ck::tensor_operation::host::
                ReferenceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>{};

            auto ref_invoker = ref_image_to_column.MakeInvoker();

            auto ref_argument = ref_image_to_column.MakeArgument(input,
                                                                 host_output,
                                                                 conv_param.filter_spatial_lengths_,
                                                                 conv_param.conv_filter_strides_,
                                                                 conv_param.conv_filter_dilations_,
                                                                 conv_param.input_left_pads_,
                                                                 conv_param.input_right_pads_);

            // init host output to zero
            host_output.SetZero();

            ref_invoker.Run(ref_argument);
        }

        using DeviceOp = ck::tensor_operation::device::
            DeviceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>;

        // get device op instances
        const auto op_ptrs = ck::tensor_operation::device::instance::
            DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

        std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

        std::string best_op_name;
        float best_avg_time   = std::numeric_limits<float>::max();
        float best_gb_per_sec = 0;

        // profile device op instances
        bool pass                   = true;
        bool is_supporting_instance = false;

        for(auto& op_ptr : op_ptrs)
        {
            auto argument_ptr = op_ptr->MakeArgumentPointer(
                static_cast<InputDataType*>(in_device_buf.GetDeviceBuffer()),
                static_cast<OutputDataType*>(out_device_buf.GetDeviceBuffer()),
                conv_param.N_,
                conv_param.C_,
                input_spatial_lengths,
                filter_spatial_lengths,
                output_spatial_lengths,
                input_g_n_c_wis_strides,
                output_m_k_strides,
                conv_filter_strides,
                conv_filter_dilations,
                input_left_pads,
                input_right_pads);

            if(op_ptr->IsSupportedArgument(argument_ptr.get()))
            {
                is_supporting_instance = true;

                // re-init output to zero before profiling next kernel
                out_device_buf.SetZero();

                std::string op_name = op_ptr->GetTypeString();

                auto invoker_ptr = op_ptr->MakeInvokerPointer();

                float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

                std::size_t num_btype = NDoHoWo * CZYX * (sizeof(OutputDataType) + sizeof(InputDataType));

                float gb_per_sec = num_btype / 1.E6 / avg_time;

                std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
                          << op_name << std::endl;

                if(avg_time < best_avg_time)
                {
                    best_op_name    = op_name;
                    best_avg_time   = avg_time;
                    best_gb_per_sec = gb_per_sec;
                }

                if(do_verification)
                {
                    out_device_buf.FromDevice(device_output.mData.data());

                    pass = pass & ck::utils::check_err(device_output, host_output);

                    if(do_log)
                    {
                        LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
                        LogRangeAsType<float>(std::cout << "host_output : ", host_output.mData, ",") << std::endl;
                        LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",") << std::endl;
                    }
                }
            }
            else
            {
                std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
            }
        }

        std::cout << "Best configuration parameters:"
                  << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
                  << "\nGB/s: " << best_gb_per_sec << std::endl;

        return is_supporting_instance && pass;
    }

    } // namespace profiler
    } // namespace ck
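As background for the profiler above, image-to-column (im2col) rewrites each convolution window of the input image as one row of a [N * Ho * Wo, C * Y * X] matrix, which is what both the reference op and the device instances produce. A tiny single-channel 2-D sketch of that layout transform (plain C++, independent of CK; stride 1 and no padding for brevity):

```cpp
#include <cstdio>
#include <vector>

// im2col for a 1-channel HxW image with a YxX filter, stride 1, no padding.
std::vector<float> im2col(const std::vector<float>& img, int H, int W, int Y, int X)
{
    const int Ho = H - Y + 1, Wo = W - X + 1;
    std::vector<float> col(static_cast<size_t>(Ho) * Wo * Y * X);
    for(int ho = 0; ho < Ho; ++ho)
        for(int wo = 0; wo < Wo; ++wo)
            for(int y = 0; y < Y; ++y)
                for(int x = 0; x < X; ++x)
                    // one output row per window position, one column per filter tap
                    col[((ho * Wo + wo) * Y + y) * X + x] = img[(ho + y) * W + (wo + x)];
    return col;
}

int main()
{
    std::vector<float> img = {1, 2, 3, 4, 5, 6, 7, 8, 9}; // 3x3 image
    auto col = im2col(img, 3, 3, 2, 2);                   // 4 window positions x 4 filter taps
    std::printf("first row: %.0f %.0f %.0f %.0f\n", col[0], col[1], col[2], col[3]); // 1 2 4 5
    return 0;
}
```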
profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
0 → 100644
View file @
e1a5137e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
#include "ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp"
namespace
ck
{
namespace
profiler
{
template
<
typename
InDataType
,
typename
OutDataType
,
typename
IndexDataType
,
typename
DOutDataType
,
typename
DInDataType
,
bool
PropagateNan
>
bool
profile_max_pool3d_bwd_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
bool
time_kernel
,
std
::
vector
<
index_t
>
in_length
,
// NCDHW
std
::
vector
<
index_t
>
window_spatial_lengths
,
std
::
vector
<
index_t
>
window_strides
,
std
::
vector
<
index_t
>
window_dilations
,
std
::
vector
<
index_t
>
input_left_pads
,
std
::
vector
<
index_t
>
input_right_pads
)
{
// AtomicAdd only support f32 for now. ComputeDataType must be float32
using
ComputeDataType
=
float
;
constexpr
index_t
InOutRank
=
5
;
constexpr
index_t
WindowRank
=
3
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
if
(
in_length
.
size
()
!=
InOutRank
||
window_spatial_lengths
.
size
()
!=
WindowRank
||
window_strides
.
size
()
!=
WindowRank
||
window_dilations
.
size
()
!=
WindowRank
||
input_left_pads
.
size
()
!=
WindowRank
||
input_right_pads
.
size
()
!=
WindowRank
)
{
std
::
cout
<<
"Parameter is incorrect"
<<
std
::
endl
;
return
false
;
}
std
::
vector
<
index_t
>
out_length
(
InOutRank
);
int
N
=
in_length
[
0
];
int
C
=
in_length
[
1
];
out_length
[
0
]
=
N
;
out_length
[
1
]
=
C
;
// Calculate Do, Ho, Wo
for
(
int
i
=
2
;
i
<
InOutRank
;
++
i
)
{
auto
pad1
=
input_left_pads
[
i
-
2
];
auto
pad2
=
input_right_pads
[
i
-
2
];
auto
windows_size
=
window_spatial_lengths
[
i
-
2
];
auto
windows_stride
=
window_strides
[
i
-
2
];
auto
windows_dilation
=
window_dilations
[
i
-
2
];
auto
eff
=
(
windows_size
-
1
)
*
windows_dilation
+
1
;
out_length
[
i
]
=
(
in_length
[
i
]
+
pad1
+
pad2
-
eff
)
/
windows_stride
+
1
;
}
int
Di
=
in_length
[
2
];
int
Hi
=
in_length
[
3
];
int
Wi
=
in_length
[
4
];
int
Do
=
out_length
[
2
];
int
Ho
=
out_length
[
3
];
int
Wo
=
out_length
[
4
];
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
D
,
std
::
size_t
H
,
std
::
size_t
W
)
{
using
namespace
ck
::
literals
;
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
D
*
C_
*
H
*
W
,
1
_uz
,
C_
*
H
*
W
,
W
*
C_
,
C_
});
};
Tensor
<
InDataType
>
in_n_c_di_hi_wi
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
));
Tensor
<
OutDataType
>
out_n_c_do_ho_wo
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
));
Tensor
<
IndexDataType
>
out_indices_n_c_do_ho_wo
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
));
Tensor
<
DOutDataType
>
dout_n_c_do_ho_wo
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
));
Tensor
<
DInDataType
>
din_n_c_di_hi_wi_host
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
));
Tensor
<
DInDataType
>
din_n_c_di_hi_wi_device
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
));
switch
(
init_method
)
{
case
0
:
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_1
<
InDataType
>
{});
dout_n_c_do_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_1
<
DOutDataType
>
{});
break
;
case
1
:
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
dout_n_c_do_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_2
<
DOutDataType
>
{
-
5
,
5
});
break
;
default:
in_n_c_di_hi_wi
.
GenerateTensorValue
(
                                              GeneratorTensor_3<InDataType>{-0.5, 0.5});
        dout_n_c_do_ho_wo.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{-0.5, 0.5});
    }

    DeviceMem indices_device_buf(sizeof(IndexDataType) *
                                 out_indices_n_c_do_ho_wo.mDesc.GetElementSpaceSize());
    DeviceMem dout_device_buf(sizeof(DOutDataType) *
                              dout_n_c_do_ho_wo.mDesc.GetElementSpaceSize());
    DeviceMem din_device_buf(sizeof(DInDataType) *
                             din_n_c_di_hi_wi_device.mDesc.GetElementSpaceSize());

    // Generate index data from forwarding
    {
        using ReferencePoolingFwdInstance =
            ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
                                                            WindowRank,
                                                            InDataType,
                                                            OutDataType,
                                                            ComputeDataType,
                                                            IndexDataType,
                                                            ck::ReduceTensorOp::MAX,
                                                            false,
                                                            true>;

        ReferencePoolingFwdInstance ref_pooling_fwd;
        auto ref_pooling_fwd_argument = ref_pooling_fwd.MakeArgument(in_n_c_di_hi_wi,
                                                                     out_n_c_do_ho_wo,
                                                                     out_indices_n_c_do_ho_wo,
                                                                     window_spatial_lengths,
                                                                     window_strides,
                                                                     window_dilations,
                                                                     input_left_pads,
                                                                     input_right_pads);
        auto ref_pooling_fwd_invoker  = ref_pooling_fwd.MakeInvoker();
        ref_pooling_fwd_invoker.Run(ref_pooling_fwd_argument);
    }

    indices_device_buf.ToDevice(out_indices_n_c_do_ho_wo.mData.data());
    dout_device_buf.ToDevice(dout_n_c_do_ho_wo.mData.data());

    using DeviceOp =
        ck::tensor_operation::device::DeviceMaxPoolBwd<DOutDataType, IndexDataType, DInDataType>;

    // get device op instances
    const auto instance_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    if(do_verification)
    {
        using ReferencePoolingBwdInstance =
            ck::tensor_operation::host::ReferenceMaxPoolBwd<DOutDataType,
                                                            IndexDataType,
                                                            ComputeDataType,
                                                            DInDataType,
                                                            PassThrough>;

        ReferencePoolingBwdInstance ref_pooling_bwd;
        auto ref_pooling_bwd_argument = ref_pooling_bwd.MakeArgument(
            dout_n_c_do_ho_wo, out_indices_n_c_do_ho_wo, din_n_c_di_hi_wi_host, PassThrough{});
        auto ref_invoker = ref_pooling_bwd.MakeInvoker();
        ref_invoker.Run(ref_pooling_bwd_argument);
    }

    int num_kernel = 0;

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
            static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
            static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
            static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
            dout_n_c_do_ho_wo.mDesc.GetElementSpaceSize(),
            din_n_c_di_hi_wi_device.mDesc.GetElementSpaceSize(),
            window_spatial_lengths,
            window_strides,
            window_dilations);

        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            ++num_kernel;
        }
        else
        {
            if(time_kernel)
            {
                std::cout << inst_ptr->GetTypeString()
                          << " skipped due to unsupported argument: ";
                LogRange(std::cout << "doutput lengths = ", out_length, ", ") << std::endl;
            }

            continue;
        }

        size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());
        DeviceMem workspace_device_buf(workspace_sz);
        inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_device_buf.GetDeviceBuffer());

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        std::size_t num_bytes =
            dout_n_c_do_ho_wo.mDesc.GetElementSize() * sizeof(DOutDataType) +
            out_indices_n_c_do_ho_wo.mDesc.GetElementSize() * sizeof(IndexDataType) +
            din_n_c_di_hi_wi_device.mDesc.GetElementSize() * sizeof(DInDataType);

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        if(time_kernel)
            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec
                      << " GB/s, " << inst_ptr->GetTypeString() << std::endl;

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            din_device_buf.FromDevice(din_n_c_di_hi_wi_device.mData.data());

            bool pass = ck::utils::check_err(din_n_c_di_hi_wi_device.mData,
                                             din_n_c_di_hi_wi_host.mData,
                                             "Error: Incorrect results",
                                             1e-3,
                                             1e-3);

            if(do_log)
            {
                LogRangeAsType<float>(
                    std::cout << "out_indices_n_c_do_ho_wo: ", out_indices_n_c_do_ho_wo.mData, ",")
                    << std::endl;
                LogRangeAsType<float>(
                    std::cout << "din_n_c_di_hi_wi_device: ", din_n_c_di_hi_wi_device.mData, ",")
                    << std::endl;
                LogRangeAsType<float>(
                    std::cout << "din_n_c_di_hi_wi_host: ", din_n_c_di_hi_wi_host.mData, ",")
                    << std::endl;
            }

            if(!pass)
            {
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                LogRange(std::cout << "doutput lengths = [", out_length, ", ") << "]." << std::endl;
                return false;
            }
            else
            {
                if(time_kernel)
                    std::cout << "pass" << std::endl;
            }
        }
    }

    if(time_kernel)
    {
        LogRange(std::cout << "length = ", out_length, ",") << std::endl;
        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_instance_name << std::endl;
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }

    return true;
}

} // namespace profiler
} // namespace ck
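For reference, avg_time above is reported in milliseconds, so num_bytes / 1.E6 / avg_time is effective bandwidth in gigabytes per second. A minimal standalone check of that arithmetic, using made-up traffic and timing figures rather than values produced by the profiler:

#include <cstdio>

int main()
{
    // Hypothetical figures, for illustration only.
    double num_bytes = 4.0e8; // bytes moved: dout + indices + din
    double avg_time  = 0.25;  // measured kernel time in milliseconds

    // bytes / 1e6 / ms  ==  (bytes / 1e9) / (ms / 1e3)  ==  GB / s
    double gb_per_sec = num_bytes / 1.E6 / avg_time;

    std::printf("%.1f GB/s\n", gb_per_sec); // prints 1600.0
    return 0;
}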
profiler/src/CMakeLists.txt
@@ -5,6 +5,7 @@ set(PROFILER_SOURCES
     profile_gemm_splitk.cpp
     profile_gemm_bias_add_reduce.cpp
     profile_gemm_add_multiply.cpp
+    profile_gemm_multiply_add.cpp
     profile_gemm_reduce.cpp
     profile_batched_gemm.cpp
     profile_batched_gemm_reduce.cpp
@@ -18,6 +19,8 @@ set(PROFILER_SOURCES
     profile_groupnorm.cpp
     profile_layernorm.cpp
     profile_max_pool3d_fwd.cpp
+    profile_avg_pool3d_bwd.cpp
+    profile_max_pool3d_bwd.cpp
     profile_softmax.cpp
     profile_batchnorm_fwd.cpp
     profile_batchnorm_bwd.cpp
@@ -25,6 +28,7 @@ set(PROFILER_SOURCES
     profile_contraction_bilinear.cpp
     profile_contraction_scale.cpp
     profile_grouped_conv_bwd_data.cpp
+    profile_image_to_column.cpp
 )
 if(DL_KERNELS)
     list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
@@ -51,6 +55,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
@@ -74,8 +79,11 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
 if(DL_KERNELS)
     target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
 endif()
profiler/src/profile_avg_pool3d_bwd.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <vector>
#include <unordered_map>

#include "profiler/data_type_enum.hpp"
#include "profiler/profile_avg_pool3d_bwd_impl.hpp"
#include "profiler_operation_registry.hpp"

using ck::index_t;

struct maxPoolbwdArgParser
{
    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
                                                                   {"wsize", {}},
                                                                   {"wstride", {}},
                                                                   {"wdilation", {}},
                                                                   {"pad1", {}},
                                                                   {"pad2", {}}};

    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
    {
        if(std::string("--") + key == argv[i])
        {
            int pos = i;
            while(++i < argc && argv[i][0] != '-') {}
            int end = i;
            for(int j = pos + 1; j < end; j++)
            {
                long_opts[key].push_back(std::stoi(argv[j]));
            }
            return true;
        }
        return false;
    }

    void operator()(int argc, char* argv[])
    {
        for(auto& kv : long_opts)
        {
            for(int i = 1; i < argc; i++)
            {
                if(parse_opt(argc, argv, kv.first, i))
                    break;
            }
        }
    }
};

void print_help_avg_pool3d_bwd()
{
    std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
              << "arg2: verification (0: no; 1: yes)\n"
              << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
              << "arg4: print tensor value (0: no; 1: yes)\n"
              << "arg5: time kernel (0=no, 1=yes)\n"
              << "--length: input tensor length for NCDHW (e.g, --length 2 32 30 30 30)\n"
              << "--wsize: window size for ZYX (e.g, --wsize 2 2 2)\n"
              << "--wstride: window stride for DHW (e.g, --wstride 2 2 2)\n"
              << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1)\n"
              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1)\n"
              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1)\n"
              << "eg: ckProfiler avg_pool3d_bwd 0 1 2 0 1 --length 2 32 30 30 30 --wsize 2 2 2 "
                 "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
              << std::endl;
}

int profile_avg_pool3d_bwd(int argc, char* argv[])
{
    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
    bool do_verification       = true;
    int init_method            = 0;
    bool do_log                = false;
    bool time_kernel           = true;

    std::vector<index_t> in_length = {2, 32, 30, 30, 30};
    std::vector<index_t> wsize     = {2, 2, 2};
    std::vector<index_t> wstride   = {2, 2, 2};
    std::vector<index_t> wdilation = {1, 1, 1};
    std::vector<index_t> pad1      = {1, 1, 1};
    std::vector<index_t> pad2      = {1, 1, 1};

    if(argc != 2 && argc != 33)
    {
        print_help_avg_pool3d_bwd();
        return 0;
    }
    else if(argc == 33)
    {
        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
        do_verification = std::stoi(argv[3]);
        init_method     = std::stoi(argv[4]);
        do_log          = std::stoi(argv[5]);
        time_kernel     = std::stoi(argv[6]);

        // parse the long options
        maxPoolbwdArgParser arg_parser;
        arg_parser(argc, argv);

        in_length = arg_parser.long_opts["length"];
        wsize     = arg_parser.long_opts["wsize"];
        wstride   = arg_parser.long_opts["wstride"];
        wdilation = arg_parser.long_opts["wdilation"];
        pad1      = arg_parser.long_opts["pad1"];
        pad2      = arg_parser.long_opts["pad2"];
    }

#ifdef CK_ENABLE_FP16
    using F16 = ck::half_t;
#endif
#ifdef CK_ENABLE_BF16
    using BF16 = ck::bhalf_t;
#endif
#ifdef CK_ENABLE_FP32
    using F32 = float;
#endif
    using NDHWC = ck::tensor_layout::convolution::NDHWC;

    if(false)
        ;
#ifdef CK_ENABLE_FP16
    else if(data_type == ck::DataTypeEnum::Half)
    {
        ck::profiler::profile_avg_pool3d_bwd_impl<F16, F16, F16, NDHWC, NDHWC>(
            do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride,
            wdilation, pad1, pad2);
    }
#endif
#ifdef CK_ENABLE_BF16
    else if(data_type == ck::DataTypeEnum::BFloat16)
    {
        ck::profiler::profile_avg_pool3d_bwd_impl<BF16, BF16, BF16, NDHWC, NDHWC>(
            do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride,
            wdilation, pad1, pad2);
    }
#endif
#ifdef CK_ENABLE_FP32
    else if(data_type == ck::DataTypeEnum::Float)
    {
        ck::profiler::profile_avg_pool3d_bwd_impl<F32, F32, F32, NDHWC, NDHWC>(
            do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride,
            wdilation, pad1, pad2);
    }
#endif
    else
    {
        throw std::runtime_error("not implemented yet");
    }

    return 0;
}
REGISTER_PROFILER_OPERATION("avg_pool3d_bwd", "avg_pool3d bwd", profile_avg_pool3d_bwd);
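To make the long-option format concrete, the sketch below applies the same scan-until-the-next-dash rule that maxPoolbwdArgParser::parse_opt uses to part of the example invocation from the help text. It is illustrative only; the parse() helper and variable names here are not part of the profiler.

#include <iostream>
#include <string>
#include <vector>

// Collect the integer values that follow "--key", stopping at the next "--option".
static std::vector<int> parse(const std::vector<std::string>& args, const std::string& key)
{
    std::vector<int> values;
    for(std::size_t i = 0; i < args.size(); ++i)
    {
        if(args[i] == "--" + key)
        {
            while(++i < args.size() && args[i][0] != '-')
                values.push_back(std::stoi(args[i]));
            break;
        }
    }
    return values;
}

int main()
{
    // Long options taken from the help-text example command line.
    std::vector<std::string> args = {"--length", "2", "32", "30", "30", "30",
                                     "--wsize", "2", "2", "2",
                                     "--pad1", "1", "1", "1"};

    for(const std::string key : {"length", "wsize", "pad1"})
    {
        std::cout << key << ":";
        for(int v : parse(args, key))
            std::cout << " " << v;
        std::cout << "\n"; // length: 2 32 30 30 30 / wsize: 2 2 2 / pad1: 1 1 1
    }
    return 0;
}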
profiler/src/profile_batched_gemm_multi_d.cpp
@@ -71,7 +71,7 @@ int profile_batched_gemm_multi_d(int argc, char* argv[])
     const int BatchCount = std::stoi(argv[17]);

     using F16 = ck::half_t;
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
     using INT8 = int8_t;
 #endif
@@ -165,7 +165,7 @@ int profile_batched_gemm_multi_d(int argc, char* argv[])
     {
         return profile(F16{}, F16{}, F16{}, Col{}, Col{}, Row{});
     }
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
     else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(INT8{}, INT8{}, INT8{}, Row{}, Row{}, Row{});
profiler/src/profile_conv_bwd_data.cpp
@@ -77,7 +77,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
     using F32  = float;
     using F16  = ck::half_t;
     using BF16 = ck::bhalf_t;
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
     using INT8 = int8_t;
 #endif
@@ -140,7 +140,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
     {
         return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, BF16{}, BF16{});
     }
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
     else if(data_type == ConvDataType::INT8_INT8_INT8)
     {
         return profile(I1, NWC{}, KXC{}, NWK{}, INT8{}, INT8{}, INT8{});
@@ -161,7 +161,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
     {
         return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, BF16{}, BF16{});
     }
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
     else if(data_type == ConvDataType::INT8_INT8_INT8)
     {
         return profile(I2, NHWC{}, KYXC{}, NHWK{}, INT8{}, INT8{}, INT8{});
@@ -182,7 +182,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
     {
         return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, BF16{}, BF16{});
     }
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
     else if(data_type == ConvDataType::INT8_INT8_INT8)
     {
         return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, INT8{}, INT8{}, INT8{});
profiler/src/profile_gemm.cpp
@@ -69,10 +69,10 @@ int profile_gemm(int argc, char* argv[])
     using F32 = float;
     using F16 = ck::half_t;
-#ifdef __bf16__
+#ifdef CK_ENABLE_BF16
     using BF16 = ck::bhalf_t;
 #endif
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
     using INT8  = int8_t;
     using INT32 = int32_t;
 #endif
@@ -123,7 +123,7 @@ int profile_gemm(int argc, char* argv[])
     if(false)
         ;
-#ifdef __fp32__
+#ifdef CK_ENABLE_FP32
     else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(Row{}, Row{}, Row{}, F32{}, F32{}, F32{}, F32{});
@@ -141,7 +141,7 @@ int profile_gemm(int argc, char* argv[])
         return profile(Col{}, Col{}, Row{}, F32{}, F32{}, F32{}, F32{});
     }
 #endif
-#ifdef __fp16__
+#ifdef CK_ENABLE_FP16
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(Row{}, Row{}, Row{}, F16{}, F16{}, F32{}, F16{});
@@ -159,7 +159,7 @@ int profile_gemm(int argc, char* argv[])
         return profile(Col{}, Col{}, Row{}, F16{}, F16{}, F32{}, F16{});
     }
 #endif
-#ifdef __bf16__
+#ifdef CK_ENABLE_BF16
     else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(Row{}, Row{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
@@ -177,7 +177,7 @@ int profile_gemm(int argc, char* argv[])
         return profile(Col{}, Col{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
     }
 #endif
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
     else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(Row{}, Row{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{});
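The hunks above all make the same substitution: compiler-builtin-style guards such as __int8__, __bf16__, __fp16__ and __fp32__ are replaced by the library's CK_ENABLE_* configuration macros. A minimal, self-contained sketch of the resulting guard-plus-dispatch pattern follows; the enum and function names are illustrative, not taken from the profiler sources.

#include <iostream>
#include <stdexcept>

enum class DataType { F32, F16 };

// The leading `if(false);` lets every data-type branch be an `else if` that can be
// compiled out independently when its CK_ENABLE_* macro is not defined.
void dispatch(DataType dt)
{
    if(false)
        ;
#ifdef CK_ENABLE_FP32
    else if(dt == DataType::F32)
    {
        std::cout << "fp32 path selected\n";
    }
#endif
#ifdef CK_ENABLE_FP16
    else if(dt == DataType::F16)
    {
        std::cout << "fp16 path selected\n";
    }
#endif
    else
    {
        throw std::runtime_error("requested data type is not enabled in this build");
    }
}

int main()
{
    try
    {
        dispatch(DataType::F32); // compile with -DCK_ENABLE_FP32 to take the fp32 branch
    }
    catch(const std::exception& e)
    {
        std::cout << e.what() << "\n";
    }
    return 0;
}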