"include/vscode:/vscode.git/clone" did not exist on "556d24953c1f61d53591ee34b671221a9e0526d8"
Commit 2a4c2316 authored by danyao12's avatar danyao12
Browse files

Merge branch 'develop' into ck_tile/fa_asm_bwd

parents 1e01ee09 770d2b77
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool2d_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Every pool2d forward instance registered in this TU performs MAX reduction.
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;

// Registers fp16 NHWC 2D max-pooling forward instances (no index output).
void add_device_pool2d_fwd_nhwc_f16_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, NHWC, NHWC, ReduceOpId, false>>>&
        instances)
{
    // fp16 in/out with fp32 accumulation.
    using InstanceList = device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, false>;
    add_device_operation_instances(instances, InstanceList{});
}

// Registers fp16 NHWC 2D max-pooling forward instances that also emit the
// argmax index tensor.
void add_device_pool2d_fwd_nhwc_index_f16_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, NHWC, NHWC, ReduceOpId, true>>>&
        instances)
{
    using InstanceList = device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, true>;
    add_device_operation_instances(instances, InstanceList{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool2d_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Every pool2d forward instance registered in this TU performs MAX reduction.
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;

// Registers fp32 NHWC 2D max-pooling forward instances (no index output).
void add_device_pool2d_fwd_nhwc_f32_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, NHWC, NHWC, ReduceOpId, false>>>&
        instances)
{
    using InstanceList = device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>;
    add_device_operation_instances(instances, InstanceList{});
}

// Registers fp32 NHWC 2D max-pooling forward instances that also emit the
// argmax index tensor.
void add_device_pool2d_fwd_nhwc_index_f32_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, NHWC, NHWC, ReduceOpId, true>>>&
        instances)
{
    using InstanceList = device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, true>;
    add_device_operation_instances(instances, InstanceList{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool2d_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Every pool2d forward instance registered in this TU performs MAX reduction.
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;

// Registers fp8 NHWC 2D max-pooling forward instances (no index output).
void add_device_pool2d_fwd_nhwc_f8_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<4, 2, F8, F8, I32, NHWC, NHWC, ReduceOpId, false>>>&
        instances)
{
    // fp8 in/out with fp32 accumulation.
    using InstanceList = device_pool2d_fwd_nhwc_instances<F8, F8, I32, F32, ReduceOpId, false>;
    add_device_operation_instances(instances, InstanceList{});
}

// Registers fp8 NHWC 2D max-pooling forward instances that also emit the
// argmax index tensor.
void add_device_pool2d_fwd_nhwc_index_f8_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<4, 2, F8, F8, I32, NHWC, NHWC, ReduceOpId, true>>>&
        instances)
{
    using InstanceList = device_pool2d_fwd_nhwc_instances<F8, F8, I32, F32, ReduceOpId, true>;
    add_device_operation_instances(instances, InstanceList{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool2d_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Every pool2d forward instance registered in this TU performs MAX reduction.
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;

// Registers int8 NHWC 2D max-pooling forward instances (no index output).
void add_device_pool2d_fwd_nhwc_i8_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<4, 2, I8, I8, I32, NHWC, NHWC, ReduceOpId, false>>>&
        instances)
{
    // int8 in/out with fp32 accumulation.
    using InstanceList = device_pool2d_fwd_nhwc_instances<I8, I8, I32, F32, ReduceOpId, false>;
    add_device_operation_instances(instances, InstanceList{});
}

// Registers int8 NHWC 2D max-pooling forward instances that also emit the
// argmax index tensor.
void add_device_pool2d_fwd_nhwc_index_i8_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<4, 2, I8, I8, I32, NHWC, NHWC, ReduceOpId, true>>>&
        instances)
{
    using InstanceList = device_pool2d_fwd_nhwc_instances<I8, I8, I32, F32, ReduceOpId, true>;
    add_device_operation_instances(instances, InstanceList{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Short aliases for the element types used by the pool2d forward instance
// lists declared in this header.
using I32 = int32_t;
using F32 = float;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F8 = ck::f8_t;

// Input/output memory layout tag shared by all instances in this header.
using NHWC = ck::tensor_layout::convolution::NHWC;

// Tuple of DevicePool2dFwd_NHWC_NHWC configurations instantiated once per
// supported (input, output, index, compute) data-type combination. The three
// entries differ only in the trailing integer template arguments, which are
// kernel tuning parameters (presumably block size and per-thread tile/vector
// widths of 1, 2 and 4 — TODO confirm against DevicePool2dFwd_NHWC_NHWC).
template <typename InDataType,
          typename OutDataType,
          typename IndexDataType,
          typename ComputeDataType,
          ReduceTensorOp ReduceOpId,
          bool OutputIndex>
using device_pool2d_fwd_nhwc_instances =
    // clang-format off
    std::tuple <
        DevicePool2dFwd_NHWC_NHWC<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 1, 1, 1>,
        DevicePool2dFwd_NHWC_NHWC<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 2, 1, 2>,
        DevicePool2dFwd_NHWC_NHWC<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 4, 1, 4>
    // clang-format on
    >;

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
set(DEVICE_POOL3D_FWD_INSTANCES)
list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
            device_max_pool3d_fwd_ndhwc_f16_instance.cpp
            device_max_pool3d_fwd_ndhwc_f8_instance.cpp
            device_avg_pool3d_fwd_ndhwc_f8_instance.cpp
            device_max_pool3d_fwd_ndhwc_i8_instance.cpp
            device_avg_pool3d_fwd_ndhwc_i8_instance.cpp
            device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
            device_max_pool3d_fwd_ndhwc_f32_instance.cpp
            device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Every pool3d forward instance registered in this TU performs AVG reduction.
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;

// Registers fp8 NDHWC 3D average-pooling forward instances. Average pooling
// has no argmax, so only the non-index variant exists here.
void add_device_pool3d_fwd_ndhwc_f8_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
        instances)
{
    // fp8 in/out with fp32 accumulation.
    using InstanceList = device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F32, ReduceOpId, false>;
    add_device_operation_instances(instances, InstanceList{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Every pool3d forward instance registered in this TU performs AVG reduction.
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;

// Registers int8 NDHWC 3D average-pooling forward instances. Average pooling
// has no argmax, so only the non-index variant exists here.
void add_device_pool3d_fwd_ndhwc_i8_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<5, 3, I8, I8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
        instances)
{
    // int8 in/out with int32 accumulation.
    using InstanceList = device_pool3d_fwd_ndhwc_instances<I8, I8, I32, I32, ReduceOpId, false>;
    add_device_operation_instances(instances, InstanceList{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Every pool3d forward instance registered in this TU performs MAX reduction.
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;

// Registers fp8 NDHWC 3D max-pooling forward instances (no index output).
void add_device_pool3d_fwd_ndhwc_f8_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
        instances)
{
    // fp8 compute type — max reduction only compares values, so no wider
    // accumulator is used here.
    using InstanceList = device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, false>;
    add_device_operation_instances(instances, InstanceList{});
}

// Registers fp8 NDHWC 3D max-pooling forward instances that also emit the
// argmax index tensor.
void add_device_pool3d_fwd_ndhwc_index_f8_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ReduceOpId, true>>>&
        instances)
{
    using InstanceList = device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, true>;
    add_device_operation_instances(instances, InstanceList{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Every pool3d forward instance registered in this TU performs MAX reduction.
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;

// Registers int8 NDHWC 3D max-pooling forward instances (no index output).
void add_device_pool3d_fwd_ndhwc_i8_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<5, 3, I8, I8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
        instances)
{
    // int8 compute type — max reduction only compares values, so no wider
    // accumulator is used here.
    using InstanceList = device_pool3d_fwd_ndhwc_instances<I8, I8, I32, I8, ReduceOpId, false>;
    add_device_operation_instances(instances, InstanceList{});
}

// Registers int8 NDHWC 3D max-pooling forward instances that also emit the
// argmax index tensor.
void add_device_pool3d_fwd_ndhwc_index_i8_instances(
    std::vector<
        std::unique_ptr<DevicePoolFwd<5, 3, I8, I8, I32, NDHWC, NDHWC, ReduceOpId, true>>>&
        instances)
{
    using InstanceList = device_pool3d_fwd_ndhwc_instances<I8, I8, I32, I8, ReduceOpId, true>;
    add_device_operation_instances(instances, InstanceList{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...@@ -15,6 +15,8 @@ namespace tensor_operation {
namespace device {
namespace instance {
using I8 = int8_t;
using F8 = ck::f8_t;
using I32 = int32_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...@@ -14,6 +14,7 @@ enum struct DataTypeEnum
Int8x4 = 4,
BFloat16 = 5,
Double = 6,
Float8 = 7,
Unknown = 100,
};
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/avg_pool2d_bwd.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp"
namespace ck {
namespace profiler {
// Returns the packed strides for a tensor whose lengths are given in NCHW
// order but whose elements are stored in the given memory layout. Only NHWC
// is currently supported; any other layout throws.
//
// @param N      batch size (unused: packed NHWC strides do not depend on N)
// @param C,H,W  channel and spatial lengths
// @param layout tag type used only for compile-time layout dispatch
// @return       strides ordered as {N, C, H, W}
// @throws std::runtime_error for layouts other than NHWC
template <typename TensorLayout>
std::vector<ck::index_t> f_tensor_strides_nchw(
    ck::index_t N, ck::index_t C, ck::index_t H, ck::index_t W, TensorLayout layout)
{
    using namespace ck::literals;
    (void)N;
    if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NHWC>::value)
        return {C * H * W, 1_uz, W * C, C};
    else
        throw std::runtime_error("not supported yet");
} // NOTE: removed stray ';' that made this an extra empty declaration
/// @brief Profiles every registered device instance of the 2D average-pooling
///        backward operation, optionally verifying each result against the
///        CPU reference implementation.
///
/// @param do_verification        compare device output against the CPU reference when non-zero
/// @param init_method            0: constant init, 1: random ints in [-5, 5], otherwise random floats in [-0.5, 0.5]
/// @param do_log                 dump device/host input-gradient tensors during verification
/// @param time_kernel            time kernels and print per-instance and best perf numbers
/// @param in_length              input lengths in NCHW order ({N, C, Hi, Wi})
/// @param window_spatial_lengths pooling-window lengths ({Y, X})
/// @param window_strides         window strides
/// @param window_dilations       window dilations
/// @param input_left_pads        left padding per spatial dim
/// @param input_right_pads       right padding per spatial dim
/// @return true when at least one instance was applicable and all verified
///         instances passed
template <typename DOutDataType, typename DInDataType, typename DOutLayout, typename DInLayout>
bool profile_avg_pool2d_bwd_impl(int do_verification,
                                 int init_method,
                                 bool do_log,
                                 bool time_kernel,
                                 std::vector<index_t> in_length,
                                 std::vector<index_t> window_spatial_lengths,
                                 std::vector<index_t> window_strides,
                                 std::vector<index_t> window_dilations,
                                 std::vector<index_t> input_left_pads,
                                 std::vector<index_t> input_right_pads)
{
    constexpr index_t InOutRank  = 4;
    constexpr index_t WindowRank = 2;

    // Reject parameter vectors that do not match the fixed 4D/2D ranks.
    if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
       window_strides.size() != WindowRank || window_dilations.size() != WindowRank ||
       input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank)
    {
        std::cout << "Parameter is incorrect" << std::endl;
        return false;
    }

    std::vector<index_t> out_length(InOutRank);
    const int N   = in_length[0];
    const int C   = in_length[1];
    out_length[0] = N;
    out_length[1] = C;

    // Calculate Ho, Wo using the standard convolution output-size formula
    // with an effective (dilated) window size.
    for(unsigned i = 2; i < InOutRank; ++i)
    {
        const int idx         = i - 2;
        auto pad1             = input_left_pads[idx];
        auto pad2             = input_right_pads[idx];
        auto windows_size     = window_spatial_lengths[idx];
        auto windows_stride   = window_strides[idx];
        auto windows_dilation = window_dilations[idx];
        auto eff              = (windows_size - 1) * windows_dilation + 1;
        out_length[i]         = (in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
    }

    const int Hi = in_length[2];
    const int Wi = in_length[3];
    const int Ho = out_length[2];
    const int Wo = out_length[3];

    // NCHW lengths with NHWC (channels-last) strides.
    auto f_host_tensor_descriptor =
        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
            using namespace ck::literals;
            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
        };

    Tensor<DOutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
    Tensor<DInDataType> in_n_c_hi_wi_device(f_host_tensor_descriptor(N, C, Hi, Wi));
    Tensor<DInDataType> in_n_c_hi_wi_host(f_host_tensor_descriptor(N, C, Hi, Wi));

    // Initialize the output gradient (the backward op's input).
    switch(init_method)
    {
    case 0: {
        out_n_c_ho_wo_host.GenerateTensorValue(GeneratorTensor_1<DOutDataType>{});
        break;
    }
    case 1: {
        out_n_c_ho_wo_host.GenerateTensorValue(GeneratorTensor_2<DOutDataType>{-5, 5});
        break;
    }
    default: {
        out_n_c_ho_wo_host.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{-0.5, 0.5});
    }
    }

    DeviceMem dout_device_buf(sizeof(DOutDataType) *
                              out_n_c_ho_wo_host.mDesc.GetElementSpaceSize());
    DeviceMem din_device_buf(sizeof(DInDataType) * in_n_c_hi_wi_device.mDesc.GetElementSpaceSize());

    dout_device_buf.ToDevice(out_n_c_ho_wo_host.mData.data());

    using DeviceOp = ck::tensor_operation::device::
        DeviceAvgPoolBwd<2, DOutDataType, DInDataType, DOutLayout, DInLayout>;

    // get device op instances
    const auto instance_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
            DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    // Compute the CPU reference result once; each instance is compared
    // against it inside the loop below.
    if(do_verification)
    {
        using ReferencePoolingBwdInstance =
            ck::tensor_operation::host::ReferenceAvgPoolBwd<2, DInDataType, DOutDataType>;

        ReferencePoolingBwdInstance ref_pooling_bwd;
        auto ref_pooling_bwd_argument = ref_pooling_bwd.MakeArgument(in_n_c_hi_wi_host,
                                                                     out_n_c_ho_wo_host,
                                                                     window_spatial_lengths,
                                                                     window_strides,
                                                                     window_dilations,
                                                                     input_left_pads,
                                                                     input_right_pads);
        auto ref_invoker              = ref_pooling_bwd.MakeInvoker();
        ref_invoker.Run(ref_pooling_bwd_argument);
    }

    int num_kernel      = 0;
    bool pass           = true;
    bool instance_found = false;

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
            static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
            static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
            {N, C, Ho, Wo},
            {N, C, Hi, Wi},
            f_tensor_strides_nchw(N, C, Ho, Wo, DOutLayout{}),
            f_tensor_strides_nchw(N, C, Hi, Wi, DInLayout{}),
            window_spatial_lengths,
            window_strides,
            window_dilations,
            input_left_pads,
            input_right_pads);

        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            ++num_kernel;
            instance_found = true;
        }
        else
        {
            if(time_kernel)
            {
                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
                LogRange(std::cout << "doutput lengths = ", out_length, ", ") << std::endl;
            }
            continue;
        }

        // The backward kernel accumulates into din, so clear it before each run.
        din_device_buf.SetZero();

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
        float avg_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        // Bandwidth estimate: one read of dout plus one write of din.
        std::size_t num_bytes = out_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(DOutDataType) +
                                in_n_c_hi_wi_device.mDesc.GetElementSize() * sizeof(DInDataType);

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        if(time_kernel)
        {
            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
                      << inst_ptr->GetTypeString() << std::endl;
        }

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            din_device_buf.FromDevice(in_n_c_hi_wi_device.mData.data());

            bool local_pass = ck::utils::check_err(in_n_c_hi_wi_device.mData,
                                                   in_n_c_hi_wi_host.mData,
                                                   "Error: Incorrect results",
                                                   1e-3,
                                                   1e-3);

            if(do_log)
            {
                LogRangeAsType<float>(
                    std::cout << "in_n_c_hi_wi_device: ", in_n_c_hi_wi_device.mData, ",")
                    << std::endl;
                LogRangeAsType<float>(
                    std::cout << "in_n_c_hi_wi_host: ", in_n_c_hi_wi_host.mData, ",")
                    << std::endl;
            }

            if(!local_pass)
            {
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                LogRange(std::cout << "doutput lengths = [", out_length, ", ") << "]." << std::endl;
                pass &= local_pass;
            }
            else
            {
                if(time_kernel)
                {
                    std::cout << "pass" << std::endl;
                }
            }
        }
    }

    if(time_kernel)
    {
        LogRange(std::cout << "length = ", out_length, ",") << std::endl;
        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_instance_name << std::endl;
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }

    return pass && instance_found;
}
} // namespace profiler
} // namespace ck
...@@ -148,6 +148,11 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
    bool pass = true;
    auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
        // workspace_sz will be equal to 0 for other layout than NGCHW
        const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
        DeviceMem workspace_dev(workspace_sz);
        op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            // re-init output to zero before profiling next kernel
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
#include "ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp"
namespace ck {
namespace profiler {
/// @brief Profiles every registered device instance of the 2D max-pooling
///        backward operation. The argmax index tensor the backward pass needs
///        is produced by running the CPU reference forward pass first; results
///        are optionally verified against the CPU reference backward pass.
///
/// @tparam PropagateNan          NaN-propagation policy (forwarded to instance selection)
/// @param do_verification        compare device output against the CPU reference when non-zero
/// @param init_method            0: constant init, 1: random ints in [-5, 5], otherwise random floats in [-0.5, 0.5]
/// @param do_log                 dump index and input-gradient tensors during verification
/// @param time_kernel            time kernels and print per-instance and best perf numbers
/// @param in_length              input lengths in NCHW order ({N, C, Hi, Wi})
/// @param window_spatial_lengths pooling-window lengths ({Y, X})
/// @param window_strides         window strides
/// @param window_dilations       window dilations
/// @param input_left_pads        left padding per spatial dim
/// @param input_right_pads       right padding per spatial dim
/// @return true when at least one instance was applicable and all verified
///         instances passed
template <typename InDataType,
          typename OutDataType,
          typename IndexDataType,
          typename DOutDataType,
          typename DInDataType,
          bool PropagateNan>
bool profile_max_pool2d_bwd_impl(int do_verification,
                                 int init_method,
                                 bool do_log,
                                 bool time_kernel,
                                 std::vector<index_t> in_length,
                                 std::vector<index_t> window_spatial_lengths,
                                 std::vector<index_t> window_strides,
                                 std::vector<index_t> window_dilations,
                                 std::vector<index_t> input_left_pads,
                                 std::vector<index_t> input_right_pads)
{
    // AtomicAdd only supports f32 for now, so ComputeDataType must be float32.
    using ComputeDataType = float;

    constexpr index_t InOutRank  = 4;
    constexpr index_t WindowRank = 2;

    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    // Reject parameter vectors that do not match the fixed 4D/2D ranks.
    if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
       window_strides.size() != WindowRank || window_dilations.size() != WindowRank ||
       input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank)
    {
        std::cout << "Parameter is incorrect" << std::endl;
        return false;
    }

    std::vector<index_t> out_length(InOutRank);
    int N         = in_length[0];
    int C         = in_length[1];
    out_length[0] = N;
    out_length[1] = C;

    // Calculate Ho, Wo using the standard convolution output-size formula
    // with an effective (dilated) window size.
    for(unsigned i = 2; i < InOutRank; ++i)
    {
        const int idx         = i - 2;
        auto pad1             = input_left_pads[idx];
        auto pad2             = input_right_pads[idx];
        auto windows_size     = window_spatial_lengths[idx];
        auto windows_stride   = window_strides[idx];
        auto windows_dilation = window_dilations[idx];
        auto eff              = (windows_size - 1) * windows_dilation + 1;
        out_length[i]         = (in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
    }

    int Hi = in_length[2];
    int Wi = in_length[3];
    int Ho = out_length[2];
    int Wo = out_length[3];

    // NCHW lengths with NHWC (channels-last) strides.
    auto f_host_tensor_descriptor =
        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
            using namespace ck::literals;
            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
        };

    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
    Tensor<OutDataType> out_n_c_ho_wo(f_host_tensor_descriptor(N, C, Ho, Wo));
    Tensor<IndexDataType> out_indices_n_c_ho_wo(f_host_tensor_descriptor(N, C, Ho, Wo));
    Tensor<DOutDataType> dout_n_c_ho_wo(f_host_tensor_descriptor(N, C, Ho, Wo));
    Tensor<DInDataType> din_n_c_hi_wi_host(f_host_tensor_descriptor(N, C, Hi, Wi));
    Tensor<DInDataType> din_n_c_hi_wi_device(f_host_tensor_descriptor(N, C, Hi, Wi));

    // Initialize the forward input and the output gradient.
    switch(init_method)
    {
    case 0: {
        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
        dout_n_c_ho_wo.GenerateTensorValue(GeneratorTensor_1<DOutDataType>{});
        break;
    }
    case 1: {
        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
        dout_n_c_ho_wo.GenerateTensorValue(GeneratorTensor_2<DOutDataType>{-5, 5});
        break;
    }
    default: {
        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
        dout_n_c_ho_wo.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{-0.5, 0.5});
    }
    }

    DeviceMem indices_device_buf(sizeof(IndexDataType) *
                                 out_indices_n_c_ho_wo.mDesc.GetElementSpaceSize());
    DeviceMem dout_device_buf(sizeof(DOutDataType) * dout_n_c_ho_wo.mDesc.GetElementSpaceSize());
    DeviceMem din_device_buf(sizeof(DInDataType) *
                             din_n_c_hi_wi_device.mDesc.GetElementSpaceSize());

    // Generate index data from forwarding
    {
        using ReferencePoolingFwdInstance =
            ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
                                                            WindowRank,
                                                            InDataType,
                                                            OutDataType,
                                                            ComputeDataType,
                                                            IndexDataType,
                                                            ck::ReduceTensorOp::MAX,
                                                            false,
                                                            true>;

        ReferencePoolingFwdInstance ref_pooling_fwd;
        auto ref_pooling_fwd_argument = ref_pooling_fwd.MakeArgument(in_n_c_hi_wi,
                                                                     out_n_c_ho_wo,
                                                                     out_indices_n_c_ho_wo,
                                                                     window_spatial_lengths,
                                                                     window_strides,
                                                                     window_dilations,
                                                                     input_left_pads,
                                                                     input_right_pads);
        auto ref_pooling_fwd_invoker  = ref_pooling_fwd.MakeInvoker();
        ref_pooling_fwd_invoker.Run(ref_pooling_fwd_argument);
    }

    indices_device_buf.ToDevice(out_indices_n_c_ho_wo.mData.data());
    dout_device_buf.ToDevice(dout_n_c_ho_wo.mData.data());

    using DeviceOp =
        ck::tensor_operation::device::DeviceMaxPoolBwd<DOutDataType, IndexDataType, DInDataType>;

    // get device op instances
    const auto instance_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
            DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    // Compute the CPU reference backward result once; each instance is
    // compared against it inside the loop below.
    if(do_verification)
    {
        using ReferencePoolingBwdInstance =
            ck::tensor_operation::host::ReferenceMaxPoolBwd<DOutDataType,
                                                            IndexDataType,
                                                            ComputeDataType,
                                                            DInDataType,
                                                            PassThrough>;

        ReferencePoolingBwdInstance ref_pooling_bwd;
        auto ref_pooling_bwd_argument = ref_pooling_bwd.MakeArgument(
            dout_n_c_ho_wo, out_indices_n_c_ho_wo, din_n_c_hi_wi_host, PassThrough{});
        auto ref_invoker              = ref_pooling_bwd.MakeInvoker();
        ref_invoker.Run(ref_pooling_bwd_argument);
    }

    int num_kernel      = 0;
    bool pass           = true;
    bool instance_found = false;

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
            static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
            static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
            static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
            dout_n_c_ho_wo.mDesc.GetElementSpaceSize(),
            din_n_c_hi_wi_device.mDesc.GetElementSpaceSize(),
            window_spatial_lengths,
            window_strides,
            window_dilations);

        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            ++num_kernel;
            instance_found = true;
        }
        else
        {
            if(time_kernel)
            {
                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
                LogRange(std::cout << "doutput lengths = ", out_length, ", ") << std::endl;
            }
            continue;
        }

        // Some instances need scratch memory; allocate and attach it before
        // invoking (size may be zero).
        size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());
        DeviceMem workspace_device_buf(workspace_sz);
        inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_device_buf.GetDeviceBuffer());

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
        float avg_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        // Bandwidth estimate: reads of dout and indices plus one write of din.
        std::size_t num_bytes =
            dout_n_c_ho_wo.mDesc.GetElementSize() * sizeof(DOutDataType) +
            out_indices_n_c_ho_wo.mDesc.GetElementSize() * sizeof(IndexDataType) +
            din_n_c_hi_wi_device.mDesc.GetElementSize() * sizeof(DInDataType);

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        if(time_kernel)
            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
                      << inst_ptr->GetTypeString() << std::endl;

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            din_device_buf.FromDevice(din_n_c_hi_wi_device.mData.data());

            bool local_pass = ck::utils::check_err(din_n_c_hi_wi_device.mData,
                                                   din_n_c_hi_wi_host.mData,
                                                   "Error: Incorrect results",
                                                   1e-3,
                                                   1e-3);

            if(do_log)
            {
                LogRangeAsType<float>(
                    std::cout << "out_indices_n_c_ho_wo: ", out_indices_n_c_ho_wo.mData, ",")
                    << std::endl;
                LogRangeAsType<float>(
                    std::cout << "din_n_c_hi_wi_device: ", din_n_c_hi_wi_device.mData, ",")
                    << std::endl;
                LogRangeAsType<float>(
                    std::cout << "din_n_c_hi_wi_host: ", din_n_c_hi_wi_host.mData, ",")
                    << std::endl;
            }

            if(!local_pass)
            {
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                LogRange(std::cout << "doutput lengths = [", out_length, ", ") << "]." << std::endl;
                pass &= local_pass;
            }
            else
            {
                if(time_kernel)
                {
                    std::cout << "pass" << std::endl;
                }
            }
        }
    }

    if(time_kernel)
    {
        LogRange(std::cout << "length = ", out_length, ",") << std::endl;
        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_instance_name << std::endl;
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }

    return pass && instance_found;
}
} // namespace profiler
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
namespace ck {
namespace profiler {
template <typename InDataType,
typename OutDataType,
typename ComputeDataType,
typename IndexDataType,
typename InLayout,
typename OutLayout,
ck::ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool OutputIndex>
bool profile_pool2d_fwd_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> in_length, // NCHW
std::vector<index_t> window_spatial_lengths,
std::vector<index_t> window_strides,
std::vector<index_t> window_dilations,
std::vector<index_t> input_left_pads,
std::vector<index_t> input_right_pads)
{
constexpr index_t InOutRank = 4;
constexpr index_t WindowRank = 2;
if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
window_strides.size() != WindowRank || window_dilations.size() != WindowRank ||
input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank)
return false;
std::vector<index_t> out_length(InOutRank);
int N = in_length[0];
int C = in_length[1];
out_length[0] = N;
out_length[1] = C;
// Calculate Ho, Wo
for(int i = 2; i < InOutRank; ++i)
{
auto pad1 = input_left_pads[i - 2];
auto pad2 = input_right_pads[i - 2];
auto windows_size = window_spatial_lengths[i - 2];
auto windows_stride = window_strides[i - 2];
auto windows_dilation = window_dilations[i - 2];
auto eff = (windows_size - 1) * windows_dilation + 1;
out_length[i] = (in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
}
int Hi = in_length[2];
int Wi = in_length[3];
int Ho = out_length[2];
int Wo = out_length[3];
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
using namespace ck::literals;
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
};
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
Tensor<IndexDataType> out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
Tensor<IndexDataType> out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
switch(init_method)
{
case 0: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
out_indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
// add device normalization instances
using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
WindowRank,
InDataType,
OutDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
OutputIndex>;
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
if(do_verification)
{
using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
WindowRank,
InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
ReduceOpId,
PropagateNan,
OutputIndex>;
ReferenceInstance ref;
auto ref_argument = ref.MakeArgument(in_n_c_hi_wi,
out_n_c_ho_wo_host,
out_indices_n_c_ho_wo_host,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
}
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
in_length,
window_spatial_lengths,
out_length,
{C * Hi * Wi, 1, Wi * C, C},
{C * Ho * Wo, 1, Wo * C, C},
{C * Ho * Wo, 1, Wo * C, C},
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3});
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
}
else
{
if(time_kernel)
{
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl;
}
continue;
}
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = in_n_c_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) +
out_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType);
if constexpr(OutputIndex)
num_bytes += out_indices_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(IndexDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
if(time_kernel)
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time)
{
best_instance_name = inst_ptr->GetTypeString();
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
bool pass = ck::utils::check_err(out_n_c_ho_wo_device.mData,
out_n_c_ho_wo_host.mData,
"Error: Incorrect results",
1e-3,
1e-3);
if constexpr(OutputIndex)
{
out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device,
out_indices_n_c_ho_wo_host);
}
if(do_log)
{
LogRangeAsType<float>(std::cout << "in_n_c_hi_wi : ", in_n_c_hi_wi.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_n_c_ho_wo_host : ", out_n_c_ho_wo_host.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_n_c_ho_wo_device : ", out_n_c_ho_wo_device.mData, ",")
<< std::endl;
if constexpr(OutputIndex)
LogRangeAsType<float>(std::cout << "out_indices_n_c_ho_wo_device : ",
out_indices_n_c_ho_wo_device.mData,
",")
<< std::endl;
}
if(!pass)
{
std::cout << inst_ptr->GetTypeString() << " failed verification: ";
LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl;
return false;
}
else
{
if(time_kernel)
std::cout << "pass" << std::endl;
}
}
}
if(time_kernel)
{
LogRange(std::cout << "length = ", in_length, ",") << std::endl;
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true;
}
} // namespace profiler
} // namespace ck
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -17,6 +17,26 @@ ...@@ -17,6 +17,26 @@
namespace ck { namespace ck {
namespace profiler { namespace profiler {
struct PoolFwdInputParams
{
int do_verification;
int init_method;
bool do_log;
bool time_kernel;
bool return_index;
int reduce_op;
};
struct PoolFwdKernelParams
{
std::vector<index_t> in_length; // NCDHW
std::vector<index_t> window_spatial_lengths;
std::vector<index_t> window_strides;
std::vector<index_t> window_dilations;
std::vector<index_t> input_left_pads;
std::vector<index_t> input_right_pads;
};
template <typename InDataType, template <typename InDataType,
typename OutDataType, typename OutDataType,
typename ComputeDataType, typename ComputeDataType,
...@@ -26,29 +46,23 @@ template <typename InDataType, ...@@ -26,29 +46,23 @@ template <typename InDataType,
ck::ReduceTensorOp ReduceOpId, ck::ReduceTensorOp ReduceOpId,
bool PropagateNan, bool PropagateNan,
bool OutputIndex> bool OutputIndex>
bool profile_pool3d_fwd_impl(int do_verification, bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& kernel_params)
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> in_length, // NCDHW
std::vector<index_t> window_spatial_lengths,
std::vector<index_t> window_strides,
std::vector<index_t> window_dilations,
std::vector<index_t> input_left_pads,
std::vector<index_t> input_right_pads)
{ {
constexpr index_t InOutRank = 5; constexpr index_t InOutRank = 5;
constexpr index_t WindowRank = 3; constexpr index_t WindowRank = 3;
if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank || if(kernel_params.in_length.size() != InOutRank ||
window_strides.size() != WindowRank || window_dilations.size() != WindowRank || kernel_params.window_spatial_lengths.size() != WindowRank ||
input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank) kernel_params.window_strides.size() != WindowRank ||
kernel_params.window_dilations.size() != WindowRank ||
kernel_params.input_left_pads.size() != WindowRank ||
kernel_params.input_right_pads.size() != WindowRank)
return false; return false;
std::vector<index_t> out_length(InOutRank); std::vector<index_t> out_length(InOutRank);
int N = in_length[0]; int N = kernel_params.in_length[0];
int C = in_length[1]; int C = kernel_params.in_length[1];
out_length[0] = N; out_length[0] = N;
out_length[1] = C; out_length[1] = C;
...@@ -56,18 +70,18 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -56,18 +70,18 @@ bool profile_pool3d_fwd_impl(int do_verification,
// Calculate Do, Ho, Wo // Calculate Do, Ho, Wo
for(int i = 2; i < InOutRank; ++i) for(int i = 2; i < InOutRank; ++i)
{ {
auto pad1 = input_left_pads[i - 2]; auto pad1 = kernel_params.input_left_pads[i - 2];
auto pad2 = input_right_pads[i - 2]; auto pad2 = kernel_params.input_right_pads[i - 2];
auto windows_size = window_spatial_lengths[i - 2]; auto windows_size = kernel_params.window_spatial_lengths[i - 2];
auto windows_stride = window_strides[i - 2]; auto windows_stride = kernel_params.window_strides[i - 2];
auto windows_dilation = window_dilations[i - 2]; auto windows_dilation = kernel_params.window_dilations[i - 2];
auto eff = (windows_size - 1) * windows_dilation + 1; auto eff = (windows_size - 1) * windows_dilation + 1;
out_length[i] = (in_length[i] + pad1 + pad2 - eff) / windows_stride + 1; out_length[i] = (kernel_params.in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
} }
int Di = in_length[2]; int Di = kernel_params.in_length[2];
int Hi = in_length[3]; int Hi = kernel_params.in_length[3];
int Wi = in_length[4]; int Wi = kernel_params.in_length[4];
int Do = out_length[2]; int Do = out_length[2];
int Ho = out_length[3]; int Ho = out_length[3];
int Wo = out_length[4]; int Wo = out_length[4];
...@@ -88,7 +102,7 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -88,7 +102,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
Tensor<IndexDataType> out_indices_n_c_do_ho_wo_device( Tensor<IndexDataType> out_indices_n_c_do_ho_wo_device(
f_host_tensor_descriptor(N, C, Do, Ho, Wo)); f_host_tensor_descriptor(N, C, Do, Ho, Wo));
switch(init_method) switch(in_params.init_method)
{ {
case 0: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break; case 0: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
case 1: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break; case 1: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
...@@ -125,7 +139,7 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -125,7 +139,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
if(do_verification) if(in_params.do_verification)
{ {
using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd<InOutRank, using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
WindowRank, WindowRank,
...@@ -141,11 +155,11 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -141,11 +155,11 @@ bool profile_pool3d_fwd_impl(int do_verification,
auto ref_argument = ref.MakeArgument(in_n_c_di_hi_wi, auto ref_argument = ref.MakeArgument(in_n_c_di_hi_wi,
out_n_c_do_ho_wo_host, out_n_c_do_ho_wo_host,
out_indices_n_c_do_ho_wo_host, out_indices_n_c_do_ho_wo_host,
window_spatial_lengths, kernel_params.window_spatial_lengths,
window_strides, kernel_params.window_strides,
window_dilations, kernel_params.window_dilations,
input_left_pads, kernel_params.input_left_pads,
input_right_pads); kernel_params.input_right_pads);
auto ref_invoker = ref.MakeInvoker(); auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
...@@ -158,16 +172,16 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -158,16 +172,16 @@ bool profile_pool3d_fwd_impl(int do_verification,
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()), static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()), static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
in_length, kernel_params.in_length,
window_spatial_lengths, kernel_params.window_spatial_lengths,
out_length, out_length,
{Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C}, {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C},
{Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C}, {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
{Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C}, {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
window_strides, kernel_params.window_strides,
window_dilations, kernel_params.window_dilations,
input_left_pads, kernel_params.input_left_pads,
input_right_pads, kernel_params.input_right_pads,
{2, 3, 4}); {2, 3, 4});
if(inst_ptr->IsSupportedArgument(argument_ptr.get())) if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
...@@ -176,10 +190,11 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -176,10 +190,11 @@ bool profile_pool3d_fwd_impl(int do_verification,
} }
else else
{ {
if(time_kernel) if(in_params.time_kernel)
{ {
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl; LogRange(std::cout << "input lengths = ", kernel_params.in_length, ", ")
<< std::endl;
} }
continue; continue;
...@@ -187,7 +202,8 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -187,7 +202,8 @@ bool profile_pool3d_fwd_impl(int do_verification,
auto invoker_ptr = inst_ptr->MakeInvokerPointer(); auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); float avg_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, in_params.time_kernel});
std::size_t num_bytes = in_n_c_di_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) + std::size_t num_bytes = in_n_c_di_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) +
out_n_c_do_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType); out_n_c_do_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType);
...@@ -198,7 +214,7 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -198,7 +214,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
if(time_kernel) if(in_params.time_kernel)
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl; << inst_ptr->GetTypeString() << std::endl;
...@@ -209,25 +225,25 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -209,25 +225,25 @@ bool profile_pool3d_fwd_impl(int do_verification,
best_gb_per_sec = gb_per_sec; best_gb_per_sec = gb_per_sec;
} }
if(do_verification) if(in_params.do_verification)
{ {
out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data()); out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());
bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData, auto tolerance = 1e-3;
bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
out_n_c_do_ho_wo_host.mData, out_n_c_do_ho_wo_host.mData,
"Error: Incorrect results", "Error: Incorrect results",
1e-3, tolerance,
1e-3); tolerance);
if constexpr(OutputIndex) if constexpr(OutputIndex)
{ {
out_indices_device_buf.FromDevice(out_indices_n_c_do_ho_wo_device.mData.data()); out_indices_device_buf.FromDevice(out_indices_n_c_do_ho_wo_device.mData.data());
pass = pass && ck::utils::check_err(out_indices_n_c_do_ho_wo_device, pass = pass && ck::utils::check_err(out_indices_n_c_do_ho_wo_device,
out_indices_n_c_do_ho_wo_host); out_indices_n_c_do_ho_wo_host);
} }
if(do_log) if(in_params.do_log)
{ {
LogRangeAsType<float>( LogRangeAsType<float>(
std::cout << "in_n_c_di_hi_wi : ", in_n_c_di_hi_wi.mData, ",") std::cout << "in_n_c_di_hi_wi : ", in_n_c_di_hi_wi.mData, ",")
...@@ -249,20 +265,21 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -249,20 +265,21 @@ bool profile_pool3d_fwd_impl(int do_verification,
if(!pass) if(!pass)
{ {
std::cout << inst_ptr->GetTypeString() << " failed verification: "; std::cout << inst_ptr->GetTypeString() << " failed verification: ";
LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl; LogRange(std::cout << "lengths = [", kernel_params.in_length, ", ")
<< "]." << std::endl;
return false; return false;
} }
else else
{ {
if(time_kernel) if(in_params.time_kernel)
std::cout << "pass" << std::endl; std::cout << "pass" << std::endl;
} }
} }
} }
if(time_kernel) if(in_params.time_kernel)
{ {
LogRange(std::cout << "length = ", in_length, ",") << std::endl; LogRange(std::cout << "length = ", kernel_params.in_length, ",") << std::endl;
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, " std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl; << best_instance_name << std::endl;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment