Merge branch 'feature/integrage-karg-simplification-pr' into feature/test

24672339 · Po-Yen, Chen · f2c5ca5a · 853e797e · 24672339 · f2c5ca5a
Commit 24672339 authored May 25, 2023 by Po-Yen, Chen
20 changed files
--- a/library/include/ck/library/utility/host_tensor.hpp
+++ b/library/include/ck/library/utility/host_tensor.hpp
@@ -411,6 +411,12 @@ struct Tensor
        }
    }

+    // Forwards the index pack unchanged to mDesc.GetOffsetFromMultiIndex() and
+    // returns its result as std::size_t. Does not modify the tensor (const).
+    template <typename... Is>
+    auto GetOffsetFromMultiIndex(Is... is) const -> std::size_t
+    {
+        const std::size_t offset = mDesc.GetOffsetFromMultiIndex(is...);
+        return offset;
+    }
+
    template <typename... Is>
    T& operator()(Is... is)
    {

--- a/library/include/ck/library/utility/op_instance_engine.hpp
+++ b/library/include/ck/library/utility/op_instance_engine.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <memory>
-#include <stdexcept>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-#include "ck/utility/functional2.hpp"
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-
-namespace ck {
-namespace utils {
-
-struct ProfileBestConfig
-{
-    std::string best_op_name;
-    float best_avg_time   = std::numeric_limits<float>::max();
-    float best_tflops     = std::numeric_limits<float>::max();
-    float best_gb_per_sec = std::numeric_limits<float>::max();
-};
-
-/**
- * @brief      This class describes an operation instance(s).
- *
- *             Op instance defines a particular specializations of operator
- *             template. Thanks to this specific input/output data types, data
- *             layouts and modifying elementwise operations it is able to create
- *             it's input/output tensors, provide pointers to instances which
- *             can execute it and all operation specific parameters.
- */
-template <typename OutDataType, typename... InArgTypes>
-class OpInstance
-{
-    public:
-    template <typename T>
-    using TensorPtr      = std::unique_ptr<Tensor<T>>;
-    using InTensorsTuple = std::tuple<TensorPtr<InArgTypes>...>;
-    using DeviceMemPtr   = std::unique_ptr<DeviceMem>;
-    using DeviceBuffers  = std::vector<DeviceMemPtr>;
-
-    OpInstance()                  = default;
-    OpInstance(const OpInstance&) = default;
-    OpInstance& operator=(const OpInstance&) = default;
-    virtual ~OpInstance(){};
-
-    virtual InTensorsTuple GetInputTensors() const         = 0;
-    virtual TensorPtr<OutDataType> GetOutputTensor() const = 0;
-    virtual std::unique_ptr<tensor_operation::device::BaseInvoker>
-    MakeInvokerPointer(tensor_operation::device::BaseOperator*) const = 0;
-    virtual std::unique_ptr<tensor_operation::device::BaseArgument>
-    MakeArgumentPointer(tensor_operation::device::BaseOperator*,
-                        const DeviceBuffers&,
-                        const DeviceMemPtr&) const = 0;
-    virtual std::size_t GetFlops() const           = 0;
-    virtual std::size_t GetBtype() const           = 0;
-};
-
-/**
- * @brief      A generic operation instance run engine.
- */
-template <typename OutDataType, typename... InArgTypes>
-class OpInstanceRunEngine
-{
-    public:
-    using OpInstanceT = OpInstance<InArgTypes..., OutDataType>;
-    template <typename T>
-    using TensorPtr        = std::unique_ptr<Tensor<T>>;
-    using DeviceMemPtr     = std::unique_ptr<DeviceMem>;
-    using InTensorsTuple   = std::tuple<TensorPtr<InArgTypes>...>;
-    using DeviceBuffers    = std::vector<DeviceMemPtr>;
-    using InArgsTypesTuple = std::tuple<InArgTypes...>;
-
-    OpInstanceRunEngine() = delete;
-
-    template <typename ReferenceOp = std::function<void()>>
-    OpInstanceRunEngine(const OpInstanceT& op_instance,
-                        const ReferenceOp& reference_op = ReferenceOp{},
-                        bool do_verification            = true)
-        : op_instance_{op_instance}
-    {
-        in_tensors_ = op_instance_.GetInputTensors();
-        out_tensor_ = op_instance_.GetOutputTensor();
-
-        if constexpr(std::is_invocable_v<ReferenceOp,
-                                         const Tensor<InArgTypes>&...,
-                                         Tensor<OutDataType>&>)
-        {
-            if(do_verification)
-            {
-                ref_output_ = op_instance_.GetOutputTensor();
-                CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
-            }
-        }
-        AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
-        out_device_buffer_ = std::make_unique<DeviceMem>(sizeof(OutDataType) *
-                                                         out_tensor_->mDesc.GetElementSpaceSize());
-        out_device_buffer_->SetZero();
-    }
-
-    virtual ~OpInstanceRunEngine(){};
-
-    template <typename OpInstancePtr>
-    bool Test(const std::vector<OpInstancePtr>& op_ptrs)
-    {
-        bool res{true};
-        for(auto& op_ptr : op_ptrs)
-        {
-            auto invoker  = op_instance_.MakeInvokerPointer(op_ptr.get());
-            auto argument = op_instance_.MakeArgumentPointer(
-                op_ptr.get(), in_device_buffers_, out_device_buffer_);
-            if(op_ptr->IsSupportedArgument(argument.get()))
-            {
-                std::cout << "Testing instance: " << op_ptr->GetTypeString() << std::endl;
-                invoker->Run(argument.get());
-                out_device_buffer_->FromDevice(out_tensor_->mData.data());
-                if(!ref_output_)
-                {
-                    throw std::runtime_error(
-                        "OpInstanceRunEngine::Test: Reference value not availabe."
-                        " You have to provide reference function.");
-                }
-                // TODO: enable flexible use of custom check_error functions
-                bool inst_res = CheckErr(out_tensor_->mData, ref_output_->mData);
-                std::cout << (inst_res ? "SUCCESS" : "FAILURE") << std::endl;
-                res = res && inst_res;
-                out_device_buffer_->SetZero();
-            }
-            else
-            {
-                std::cout << "Given conv problem is not supported by instance: \n\t>>>>"
-                          << op_ptr->GetTypeString() << std::endl;
-            }
-        }
-        return res;
-    }
-
-    template <typename OpInstancePtr>
-    ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
-                              bool time_kernel     = false,
-                              bool do_verification = false,
-                              bool do_log          = false)
-    {
-        ProfileBestConfig best_config;
-
-        for(auto& op_ptr : op_ptrs)
-        {
-            auto invoker  = op_instance_.MakeInvokerPointer(op_ptr.get());
-            auto argument = op_instance_.MakeArgumentPointer(
-                op_ptr.get(), in_device_buffers_, out_device_buffer_);
-            if(op_ptr->IsSupportedArgument(argument.get()))
-            {
-                std::string op_name = op_ptr->GetTypeString();
-                float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
-
-                std::size_t flops     = op_instance_.GetFlops();
-                std::size_t num_btype = op_instance_.GetBtype();
-                float tflops          = static_cast<float>(flops) / 1.E9 / avg_time;
-                float gb_per_sec      = num_btype / 1.E6 / avg_time;
-
-                std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                          << " GB/s, " << op_name << std::endl;
-
-                if(avg_time < best_config.best_avg_time)
-                {
-                    best_config.best_op_name    = op_name;
-                    best_config.best_tflops     = tflops;
-                    best_config.best_gb_per_sec = gb_per_sec;
-                    best_config.best_avg_time   = avg_time;
-                }
-
-                if(do_verification)
-                {
-                    out_device_buffer_->FromDevice(out_tensor_->mData.data());
-                    if(!ref_output_)
-                    {
-                        throw std::runtime_error(
-                            "OpInstanceRunEngine::Profile: Reference value not availabe."
-                            " You have to provide reference function.");
-                    }
-                    // TODO: enable flexible use of custom check_error functions
-                    CheckErr(out_tensor_->mData, ref_output_->mData);
-
-                    if(do_log) {}
-                }
-                out_device_buffer_->SetZero();
-            }
-        }
-        return best_config;
-    }
-
-    void SetAtol(double a) { atol_ = a; }
-    void SetRtol(double r) { rtol_ = r; }
-
-    private:
-    template <typename F, std::size_t... Is>
-    void CallRefOpUnpackArgs(const F& f, std::index_sequence<Is...>) const
-    {
-        f(*std::get<Is>(in_tensors_)..., *ref_output_);
-    }
-
-    template <std::size_t... Is>
-    void AllocateDeviceInputTensors(std::index_sequence<Is...>)
-    {
-        (AllocateDeviceInputTensorsImpl<Is>(), ...);
-    }
-
-    template <std::size_t Index>
-    void AllocateDeviceInputTensorsImpl()
-    {
-        const auto& ts = std::get<Index>(in_tensors_);
-        in_device_buffers_
-            .emplace_back(
-                std::make_unique<DeviceMem>(sizeof(std::tuple_element_t<Index, InArgsTypesTuple>) *
-                                            ts->mDesc.GetElementSpaceSize()))
-            ->ToDevice(ts->mData.data());
-    }
-
-    static constexpr std::size_t kNInArgs_ = std::tuple_size_v<InTensorsTuple>;
-    const OpInstanceT& op_instance_;
-    double rtol_{1e-5};
-    double atol_{1e-8};
-
-    InTensorsTuple in_tensors_;
-    TensorPtr<OutDataType> out_tensor_;
-    TensorPtr<OutDataType> ref_output_;
-
-    DeviceBuffers in_device_buffers_;
-    DeviceMemPtr out_device_buffer_;
-
-    template <typename T>
-    bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
-    {
-        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", rtol_, atol_);
-    }
-};
-
-} // namespace utils
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/CMakeLists.txt
+# Declares the device_pool_fwd_instance library from the eight forward-pooling
+# instance sources below: {avg, max} x {2D NHWC, 3D NDHWC} x {f16, f32}.
+# NOTE(review): add_instance_library is a project macro not visible here --
+# confirm how it registers the target before relying on this description.
+add_instance_library(device_pool_fwd_instance
+    device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+    device_avg_pool2d_fwd_nhwc_f32_instance.cpp
+    device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
+    device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
+    device_max_pool2d_fwd_nhwc_f16_instance.cpp
+    device_max_pool2d_fwd_nhwc_f32_instance.cpp
+    device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+    device_max_pool3d_fwd_ndhwc_f32_instance.cpp
+)
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+// Reduction selected for every registration function in this file: AVG.
+static constexpr auto ReduceOpId = ReduceTensorOp::AVG;
+
+// Hands `instances` to add_device_operation_instances together with a
+// default-constructed device_pool2d_fwd_nhwc_instances tuple specialised as
+// InDataType = F16, OutDataType = F16, IndexDataType = I32,
+// ComputeDataType = F32, OutputIndex = false.
+void add_device_pool2d_fwd_nhwc_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, false>>>& instances)
+{
+    using InstanceTuple = device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, false>;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+// Reduction selected for every registration function in this file: AVG.
+static constexpr auto ReduceOpId = ReduceTensorOp::AVG;
+
+// Hands `instances` to add_device_operation_instances together with a
+// default-constructed device_pool2d_fwd_nhwc_instances tuple specialised as
+// InDataType = F32, OutDataType = F32, IndexDataType = I32,
+// ComputeDataType = F32, OutputIndex = false.
+void add_device_pool2d_fwd_nhwc_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, ReduceOpId, false>>>& instances)
+{
+    using InstanceTuple = device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+// Reduction selected for every registration function in this file: AVG.
+static constexpr auto ReduceOpId = ReduceTensorOp::AVG;
+
+// Hands `instances` to add_device_operation_instances together with a
+// default-constructed device_pool3d_fwd_ndhwc_instances tuple specialised as
+// InDataType = F16, OutDataType = F16, IndexDataType = I32,
+// ComputeDataType = F32, OutputIndex = false.
+void add_device_pool3d_fwd_ndhwc_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F16, F16, I32, ReduceOpId, false>>>& instances)
+{
+    using InstanceTuple = device_pool3d_fwd_ndhwc_instances<F16, F16, I32, F32, ReduceOpId, false>;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+// Reduction selected for every registration function in this file: AVG.
+static constexpr auto ReduceOpId = ReduceTensorOp::AVG;
+
+// Hands `instances` to add_device_operation_instances together with a
+// default-constructed device_pool3d_fwd_ndhwc_instances tuple specialised as
+// InDataType = F32, OutDataType = F32, IndexDataType = I32,
+// ComputeDataType = F32, OutputIndex = false.
+void add_device_pool3d_fwd_ndhwc_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F32, F32, I32, ReduceOpId, false>>>& instances)
+{
+    using InstanceTuple = device_pool3d_fwd_ndhwc_instances<F32, F32, I32, F32, ReduceOpId, false>;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+// Reduction selected for every registration function in this file: MAX.
+static constexpr auto ReduceOpId = ReduceTensorOp::MAX;
+
+// Hands `instances` to add_device_operation_instances together with a
+// default-constructed device_pool2d_fwd_nhwc_instances tuple specialised as
+// InDataType = F16, OutDataType = F16, IndexDataType = I32,
+// ComputeDataType = F16, OutputIndex = false.
+void add_device_pool2d_fwd_nhwc_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, false>>>& instances)
+{
+    using InstanceTuple = device_pool2d_fwd_nhwc_instances<F16, F16, I32, F16, ReduceOpId, false>;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+// Counterpart of the function above with OutputIndex = true, both in the
+// DevicePoolFwd element type of `instances` and in the instance tuple.
+void add_device_pool2d_fwd_nhwc_index_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, true>>>& instances)
+{
+    using InstanceTuple = device_pool2d_fwd_nhwc_instances<F16, F16, I32, F16, ReduceOpId, true>;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+// Reduction selected for every registration function in this file: MAX.
+static constexpr auto ReduceOpId = ReduceTensorOp::MAX;
+
+// Hands `instances` to add_device_operation_instances together with a
+// default-constructed device_pool2d_fwd_nhwc_instances tuple specialised as
+// InDataType = F32, OutDataType = F32, IndexDataType = I32,
+// ComputeDataType = F32, OutputIndex = false.
+void add_device_pool2d_fwd_nhwc_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, ReduceOpId, false>>>& instances)
+{
+    using InstanceTuple = device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+// Counterpart of the function above with OutputIndex = true, both in the
+// DevicePoolFwd element type of `instances` and in the instance tuple.
+void add_device_pool2d_fwd_nhwc_index_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, ReduceOpId, true>>>& instances)
+{
+    using InstanceTuple = device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, true>;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+// Reduction selected for every registration function in this file: MAX.
+static constexpr auto ReduceOpId = ReduceTensorOp::MAX;
+
+// Hands `instances` to add_device_operation_instances together with a
+// default-constructed device_pool3d_fwd_ndhwc_instances tuple specialised as
+// InDataType = F16, OutDataType = F16, IndexDataType = I32,
+// ComputeDataType = F16, OutputIndex = false.
+void add_device_pool3d_fwd_ndhwc_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F16, F16, I32, ReduceOpId, false>>>& instances)
+{
+    using InstanceTuple = device_pool3d_fwd_ndhwc_instances<F16, F16, I32, F16, ReduceOpId, false>;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+// Counterpart of the function above with OutputIndex = true, both in the
+// DevicePoolFwd element type of `instances` and in the instance tuple.
+void add_device_pool3d_fwd_ndhwc_index_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F16, F16, I32, ReduceOpId, true>>>& instances)
+{
+    using InstanceTuple = device_pool3d_fwd_ndhwc_instances<F16, F16, I32, F16, ReduceOpId, true>;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+// Reduction selected for every registration function in this file: MAX.
+static constexpr auto ReduceOpId = ReduceTensorOp::MAX;
+
+// Hands `instances` to add_device_operation_instances together with a
+// default-constructed device_pool3d_fwd_ndhwc_instances tuple specialised as
+// InDataType = F32, OutDataType = F32, IndexDataType = I32,
+// ComputeDataType = F32, OutputIndex = false.
+void add_device_pool3d_fwd_ndhwc_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F32, F32, I32, ReduceOpId, false>>>& instances)
+{
+    using InstanceTuple = device_pool3d_fwd_ndhwc_instances<F32, F32, I32, F32, ReduceOpId, false>;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+// Counterpart of the function above with OutputIndex = true, both in the
+// DevicePoolFwd element type of `instances` and in the instance tuple.
+void add_device_pool3d_fwd_ndhwc_index_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F32, F32, I32, ReduceOpId, true>>>& instances)
+{
+    using InstanceTuple = device_pool3d_fwd_ndhwc_instances<F32, F32, I32, F32, ReduceOpId, true>;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdint>
+#include <tuple>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp"
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Short data-type aliases used by the pooling instance lists below and by the
+// translation units that include this header.
+using I32 = std::int32_t;
+using F16 = ck::half_t;
+using F32 = float;
+
+// Tuple type listing the 2D forward-pooling device operations to instantiate
+// for one (InDataType, OutDataType, IndexDataType, ComputeDataType,
+// ReduceOpId, OutputIndex) combination. The three entries differ only in the
+// fourth and sixth trailing numeric arguments (1, 2, 4).
+// NOTE(review): the meaning of the trailing numeric template arguments is
+// defined by DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C, not here.
+template <typename InDataType,
+          typename OutDataType,
+          typename IndexDataType,
+          typename ComputeDataType,
+          ReduceTensorOp ReduceOpId,
+          bool OutputIndex>
+using device_pool2d_fwd_nhwc_instances =
+    // clang-format off
+    std::tuple<
+        DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 1, 1, 1>,
+        DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 2, 1, 2>,
+        DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 4, 1, 4>
+    >;
+// clang-format on
+
+// Tuple type listing the 3D forward-pooling device operations to instantiate;
+// same parameter list and same three numeric variants as the 2D list above.
+// NOTE(review): the meaning of the trailing numeric template arguments is
+// defined by DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C, not here.
+template <typename InDataType,
+          typename OutDataType,
+          typename IndexDataType,
+          typename ComputeDataType,
+          ReduceTensorOp ReduceOpId,
+          bool OutputIndex>
+using device_pool3d_fwd_ndhwc_instances =
+    // clang-format off
+    std::tuple<
+        DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 1, 1, 1>,
+        DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 2, 1, 2>,
+        DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 4, 1, 4>
+    >;
+// clang-format on
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/profiler/include/profiler/data_type_enum_helper.hpp
+++ b/profiler/include/profiler/data_type_enum_helper.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma
-
-#include "ck/utility/data_type.hpp"
-#include "profiler/data_type_enum.hpp"
-
-namespace ck {
-
-template <DataTypeEnum DataTypeEnum>
-struct get_datatype_from_enum;
-
-template <>
-struct get_datatype_from_enum<DataTypeEnum::Int8>
-{
-    using type = int8_t;
-};
-
-template <>
-struct get_datatype_from_enum<DataTypeEnum::Int32>
-{
-    using type = int32_t;
-};
-
-template <>
-struct get_datatype_from_enum<DataTypeEnum::Half>
-{
-    using type = half_t;
-};
-
-template <>
-struct get_datatype_from_enum<DataTypeEnum::Float>
-{
-    using type = float;
-};
-
-template <>
-struct get_datatype_from_enum<DataTypeEnum::Double>
-{
-    using type = double;
-};
-
-template <typename T>
-struct get_datatype_enum_from_type;
-
-template <>
-struct get_datatype_enum_from_type<int8_t>
-{
-    static constexpr DataTypeEnum value = DataTypeEnum::Int8;
-};
-
-template <>
-struct get_datatype_enum_from_type<int32_t>
-{
-    static constexpr DataTypeEnum value = DataTypeEnum::Int32;
-};
-
-template <>
-struct get_datatype_enum_from_type<half_t>
-{
-    static constexpr DataTypeEnum value = DataTypeEnum::Half;
-};
-
-template <>
-struct get_datatype_enum_from_type<float>
-{
-    static constexpr DataTypeEnum value = DataTypeEnum::Float;
-};
-
-template <>
-struct get_datatype_enum_from_type<double>
-{
-    static constexpr DataTypeEnum value = DataTypeEnum::Double;
-};
-
-} // namespace ck
--- a/profiler/include/profiler/profile_convnd_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_convnd_bwd_data_impl.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
-
-using F16  = ck::half_t;
-using F32  = float;
-using BF16 = ck::bhalf_t;
-using INT8 = int8_t;
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using DeviceConvBwdDataNoOpPtr =
-    DeviceConvBwdDataPtr<ck::tensor_operation::element_wise::PassThrough,
-                         ck::tensor_operation::element_wise::PassThrough,
-                         ck::tensor_operation::element_wise::PassThrough>;
-void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-
-void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-
-void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-
-namespace ck {
-namespace profiler {
-using DeviceConvBwdDataNoOpPtr = ck::tensor_operation::device::instance::DeviceConvBwdDataNoOpPtr;
-
-template <typename InLayout>
-HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                      int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-template <typename WeiLayout>
-HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                        int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-template <typename OutLayout>
-HostTensorDescriptor get_output_host_ensor_descriptor(const std::vector<std::size_t>& dims,
-                                                      int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-template <typename InDataType, typename WeiDataType, typename OutDataType>
-void get_device_conv_bwd_data_op_ptr(
-    InDataType, WeiDataType, OutDataType, std::vector<DeviceConvBwdDataNoOpPtr>&, int)
-{
-    std::cout << "can not find device conv bwd data" << std::endl;
-    exit(1);
-}
-template <>
-void get_device_conv_bwd_data_op_ptr(
-    F32, F32, F32, std::vector<DeviceConvBwdDataNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-template <>
-void get_device_conv_bwd_data_op_ptr(
-    F16, F16, F16, std::vector<DeviceConvBwdDataNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-template <>
-void get_device_conv_bwd_data_op_ptr(
-    BF16, BF16, BF16, std::vector<DeviceConvBwdDataNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-template <>
-void get_device_conv_bwd_data_op_ptr(
-    INT8, INT8, INT8, std::vector<DeviceConvBwdDataNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-
-template <typename T>
-static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
-{
-    float max_diff = 1e-6;
-
-    for(std::size_t i = 0; i < ref.mData.size(); ++i)
-    {
-        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        if(max_diff < diff)
-        {
-            return false;
-        }
-    }
-    return true;
-}
-template <typename DataType>
-void show_data_nhwc_layout(Tensor<DataType>& nhwc)
-{
-    std::cout << "[";
-    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
-    {
-        std::cout << "[";
-        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
-        {
-            std::cout << "[";
-            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
-            {
-                std::cout << "[";
-                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
-                {
-                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << "  ";
-                }
-                std::cout << "]";
-            }
-            std::cout << "]";
-        }
-        std::cout << "]";
-    }
-    std::cout << "]";
-}
-
-template <int NDimSpatial,
-          typename InDataType,
-          typename WeiDataType,
-          typename OutDataType,
-          typename AccDataType,
-          typename InLayout,
-          typename WeiLayout,
-          typename OutLayout>
-bool profile_convnd_bwd_data_impl(int do_verification,
-                                  int init_method,
-                                  bool do_log,
-                                  bool time_kernel,
-                                  ck::index_t N,
-                                  ck::index_t K,
-                                  ck::index_t C,
-                                  const std::vector<ck::index_t>& input_spatial_lengths,
-                                  const std::vector<ck::index_t>& filter_spatial_lengths,
-                                  const std::vector<ck::index_t>& output_spatial_lengths,
-                                  const std::vector<ck::index_t>& conv_filter_strides,
-                                  const std::vector<ck::index_t>& conv_filter_dilations,
-                                  const std::vector<ck::index_t>& input_left_pads,
-                                  const std::vector<ck::index_t>& input_right_pads)
-{
-    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-    const auto in_element_op  = InElementOp{};
-    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{};
-
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
-    input_dims.insert(
-        std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));
-
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
-    filter_dims.insert(std::end(filter_dims),
-                       std::begin(filter_spatial_lengths),
-                       std::end(filter_spatial_lengths));
-
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
-    output_dims.insert(std::end(output_dims),
-                       std::begin(output_spatial_lengths),
-                       std::end(output_spatial_lengths));
-
-    Tensor<InDataType> input_host_result(
-        get_input_host_tensor_descriptor<InLayout>(input_dims, NDimSpatial));
-    Tensor<InDataType> input_device_result(
-        get_input_host_tensor_descriptor<InLayout>(input_dims, NDimSpatial));
-    Tensor<WeiDataType> weights(
-        get_filters_host_tensor_descriptor<WeiLayout>(filter_dims, NDimSpatial));
-    Tensor<OutDataType> output(
-        get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));
-
-    std::cout << "input: " << input_host_result.mDesc << std::endl;
-    std::cout << "weights: " << weights.mDesc << std::endl;
-    std::cout << "output: " << output.mDesc << std::endl;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        output.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-        break;
-    default:
-        output.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
-        weights.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
-    }
-
-    DeviceMem in_device_buf(sizeof(InDataType) * input_device_result.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
-
-    out_device_buf.ToDevice(output.mData.data());
-    wei_device_buf.ToDevice(weights.mData.data());
-
-    // reset input to zero
-    in_device_buf.SetZero();
-
-    if(do_verification)
-    {
-        auto RunReference = [&](auto& ref_conv) {
-            auto ref_invoker = ref_conv.MakeInvoker();
-
-            auto ref_argument = ref_conv.MakeArgument(input_host_result,
-                                                      weights,
-                                                      output,
-                                                      conv_filter_strides,
-                                                      conv_filter_dilations,
-                                                      input_left_pads,
-                                                      input_right_pads,
-                                                      InElementOp{},
-                                                      WeiElementOp{},
-                                                      OutElementOp{});
-            ref_invoker.Run(ref_argument);
-        };
-
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
-                                                                         WeiDataType,
-                                                                         OutDataType,
-                                                                         AccDataType,
-                                                                         InElementOp,
-                                                                         WeiElementOp,
-                                                                         OutElementOp,
-                                                                         NDimSpatial>();
-        RunReference(ref_conv);
-    }
-
-    // add device Conv instances
-    std::vector<DeviceConvBwdDataNoOpPtr> conv_ptrs;
-    get_device_conv_bwd_data_op_ptr(
-        InDataType{}, WeiDataType{}, OutDataType{}, conv_ptrs, NDimSpatial);
-
-    if(conv_ptrs.size() <= 0)
-    {
-        throw std::runtime_error("wrong! no device Conv instance found");
-    }
-
-    std::string best_conv_name;
-    float best_ave_time   = 0;
-    float best_tflops     = 0;
-    float best_gb_per_sec = 0;
-
-    // profile device Conv instances
-    bool success = true;
-    for(auto& conv_ptr : conv_ptrs)
-    {
-        auto argument_ptr = conv_ptr->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            N,
-            K,
-            C,
-            input_spatial_lengths,
-            filter_spatial_lengths,
-            output_spatial_lengths,
-            conv_filter_strides,
-            conv_filter_dilations,
-            input_left_pads,
-            input_right_pads,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
-
-        auto invoker_ptr = conv_ptr->MakeInvokerPointer();
-
-        if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-            std::string conv_name = conv_ptr->GetTypeString();
-
-            float ave_time =
-                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-
-            std::size_t flop =
-                ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths);
-            std::size_t num_btype =
-                ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
-                    N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths);
-
-            float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-            float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                      << " GB/s" << std::endl;
-
-            if(tflops > best_tflops)
-            {
-                best_conv_name  = conv_name;
-                best_tflops     = tflops;
-                best_ave_time   = ave_time;
-                best_gb_per_sec = gb_per_sec;
-            }
-
-            if(do_verification)
-            {
-                in_device_buf.FromDevice(input_device_result.mData.data());
-
-                if(!check_out(input_host_result, input_device_result))
-                {
-                    std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
-
-                    success = false;
-                }
-                else
-                {
-                    std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
-                }
-
-                success = ck::utils::check_err(input_host_result, input_device_result);
-
-                if(do_log)
-                {
-                    std::cout << "in : ";
-                    show_data_nhwc_layout(output);
-                    std::cout << std::endl;
-
-                    std::cout << "wei: ";
-                    show_data_nhwc_layout(weights);
-                    std::cout << std::endl;
-
-                    std::cout << "out_host  : ";
-                    show_data_nhwc_layout(input_host_result);
-                    std::cout << std::endl;
-
-                    std::cout << "out_device: ";
-                    show_data_nhwc_layout(input_device_result);
-                    std::cout << std::endl;
-                }
-            }
-        }
-    }
-
-    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
-    return success;
-}
-
-} // namespace profiler
-} // namespace ck
--- a/profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp
-#pragma once
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_conv_backward_weight.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"
-
-using F16  = ck::half_t;
-using F32  = float;
-using BF16 = ck::bhalf_t;
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using DeviceConvndBwdWeightNoOpPtr =
-    DeviceConvBwdWeightPtr<ck::tensor_operation::element_wise::PassThrough,
-                           ck::tensor_operation::element_wise::PassThrough,
-                           ck::tensor_operation::element_wise::PassThrough>;
-
-void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f32_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-void add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-
-void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f16_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-void add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f16_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-
-void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_bf16_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_bf16_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-
-namespace ck {
-namespace profiler {
-
-using DeviceConvndBwdWeightNoOpPtr =
-    ck::tensor_operation::device::instance::DeviceConvndBwdWeightNoOpPtr;
-
-template <typename InLayout>
-HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                      int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-
-template <typename WeiLayout>
-HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                        int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-
-template <typename OutLayout>
-HostTensorDescriptor get_output_host_ensor_descriptor(const std::vector<std::size_t>& dims,
-                                                      int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-
-template <typename InDataType, typename WeiDataType, typename OutDataType>
-void get_device_conv_bwd_weight_op_ptr(
-    InDataType, WeiDataType, OutDataType, std::vector<DeviceConvndBwdWeightNoOpPtr>&, int)
-{
-    std::cout << "can not find device conv bwd weight" << std::endl;
-    exit(1);
-}
-
-template <>
-void get_device_conv_bwd_weight_op_ptr(
-    F32, F32, F32, std::vector<DeviceConvndBwdWeightNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-
-template <>
-void get_device_conv_bwd_weight_op_ptr(
-    F16, F16, F16, std::vector<DeviceConvndBwdWeightNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-
-template <>
-void get_device_conv_bwd_weight_op_ptr(
-    BF16, BF16, BF16, std::vector<DeviceConvndBwdWeightNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-
-template <typename DataType>
-void show_data_nhwc_layout(Tensor<DataType>& nhwc)
-{
-    std::cout << "[";
-    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
-    {
-        std::cout << "[";
-        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
-        {
-            std::cout << "[";
-            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
-            {
-                std::cout << "[";
-                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
-                {
-                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << "  ";
-                }
-                std::cout << "]";
-            }
-            std::cout << "]";
-        }
-        std::cout << "]";
-    }
-    std::cout << "]";
-}
-
-template <int NDimSpatial,
-          typename InDataType,
-          typename WeiDataType,
-          typename OutDataType,
-          typename InLayout,
-          typename WeiLayout,
-          typename OutLayout>
-bool profile_convnd_bwd_weight_impl(int do_verification,
-                                    int init_method,
-                                    bool do_log,
-                                    bool time_kernel,
-                                    ck::index_t N,
-                                    ck::index_t K,
-                                    ck::index_t C,
-                                    std::vector<ck::index_t> input_spatial_lengths,
-                                    std::vector<ck::index_t> filter_spatial_lengths,
-                                    std::vector<ck::index_t> output_spatial_lengths,
-                                    std::vector<ck::index_t> conv_filter_strides,
-                                    std::vector<ck::index_t> conv_filter_dilations,
-                                    std::vector<ck::index_t> input_left_pads,
-                                    std::vector<ck::index_t> input_right_pads,
-                                    ck::index_t split_k)
-{
-    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-    const auto in_element_op  = InElementOp{};
-    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{};
-
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
-    input_dims.insert(
-        std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));
-
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
-    filter_dims.insert(std::end(filter_dims),
-                       std::begin(filter_spatial_lengths),
-                       std::end(filter_spatial_lengths));
-
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
-    output_dims.insert(std::end(output_dims),
-                       std::begin(output_spatial_lengths),
-                       std::end(output_spatial_lengths));
-
-    Tensor<InDataType> input(get_input_host_tensor_descriptor<InLayout>(input_dims, NDimSpatial));
-    Tensor<WeiDataType> weights_host_result(
-        get_filters_host_tensor_descriptor<WeiLayout>(filter_dims, NDimSpatial));
-    Tensor<WeiDataType> weights_device_result(
-        get_filters_host_tensor_descriptor<WeiLayout>(filter_dims, NDimSpatial));
-    Tensor<OutDataType> output(
-        get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));
-
-    std::cout << "input: " << input.mDesc << std::endl;
-    std::cout << "weights: " << weights_host_result.mDesc << std::endl;
-    std::cout << "output: " << output.mDesc << std::endl;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        input.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
-        output.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
-        break;
-    default:
-        input.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
-        output.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
-    }
-
-    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights_device_result.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
-
-    in_device_buf.ToDevice(input.mData.data());
-    out_device_buf.ToDevice(output.mData.data());
-
-    // reset input to zero
-    wei_device_buf.SetZero();
-
-    if(do_verification)
-    {
-        auto RunReference = [&](auto& ref_conv) {
-            auto ref_invoker = ref_conv.MakeInvoker();
-
-            auto ref_argument = ref_conv.MakeArgument(input,
-                                                      weights_host_result,
-                                                      output,
-                                                      conv_filter_strides,
-                                                      conv_filter_dilations,
-                                                      input_left_pads,
-                                                      input_right_pads,
-                                                      InElementOp{},
-                                                      WeiElementOp{},
-                                                      OutElementOp{});
-            ref_invoker.Run(ref_argument);
-        };
-
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<InDataType,
-                                                                           WeiDataType,
-                                                                           OutDataType,
-                                                                           InElementOp,
-                                                                           WeiElementOp,
-                                                                           OutElementOp,
-                                                                           NDimSpatial>();
-        RunReference(ref_conv);
-    }
-
-    // add device Conv instances
-    std::vector<DeviceConvndBwdWeightNoOpPtr> conv_ptrs;
-    get_device_conv_bwd_weight_op_ptr(
-        InDataType{}, WeiDataType{}, OutDataType{}, conv_ptrs, NDimSpatial);
-
-    if(conv_ptrs.size() <= 0)
-    {
-        throw std::runtime_error("wrong! no device Conv instance found");
-    }
-
-    std::string best_conv_name;
-    float best_ave_time   = 0;
-    float best_tflops     = 0;
-    float best_gb_per_sec = 0;
-
-    // profile device Conv instances
-    bool success = true;
-    for(auto& conv_ptr : conv_ptrs)
-    {
-        // using atomic, so need to reset input, setzero is done in invoker
-        // if(split_k > 1)
-        //{
-        //    wei_device_buf.SetZero();
-        //}
-
-        auto argument_ptr = conv_ptr->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            N,
-            K,
-            C,
-            input_spatial_lengths,
-            filter_spatial_lengths,
-            output_spatial_lengths,
-            conv_filter_strides,
-            conv_filter_dilations,
-            input_left_pads,
-            input_right_pads,
-            in_element_op,
-            wei_element_op,
-            out_element_op,
-            split_k);
-
-        if(!conv_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-            std::cout << "wrong! device_conv with the specified compilation parameters does "
-                         "not support this Conv problem"
-                      << std::endl;
-            continue;
-        }
-
-        auto invoker_ptr      = conv_ptr->MakeInvokerPointer();
-        std::string conv_name = conv_ptr->GetTypeString();
-        float ave_time        = 0;
-
-        if(std::is_same<InDataType, ck::bhalf_t>::value && split_k > 1)
-        {
-            // alloc work space
-            size_t bwd_weight_workspace_size = conv_ptr->GetWorkSpaceSize(argument_ptr.get());
-            if(bwd_weight_workspace_size <= 0)
-            {
-                printf("wrong work space size\n");
-                exit(1);
-            }
-            DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
-            wei_work_space_device_buf.SetZero();
-            conv_ptr->SetWorkSpacePointer(argument_ptr.get(),
-                                          wei_work_space_device_buf.GetDeviceBuffer());
-            ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-        }
-        else
-        {
-            ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-        }
-
-        std::size_t flop =
-            ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths);
-        std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
-            N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths);
-
-        float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-        float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                  << " GB/s" << std::endl;
-
-        if(tflops > best_tflops)
-        {
-            best_conv_name  = conv_name;
-            best_tflops     = tflops;
-            best_ave_time   = ave_time;
-            best_gb_per_sec = gb_per_sec;
-        }
-
-        if(do_verification)
-        {
-            wei_device_buf.FromDevice(weights_device_result.mData.data());
-
-            success = ck::utils::check_err(weights_host_result, weights_device_result);
-
-            if(success == false)
-            {
-                std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
-            }
-            else
-            {
-                std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
-            }
-
-            if(do_log)
-            {
-                std::cout << "in : ";
-                show_data_nhwc_layout(output);
-                std::cout << std::endl;
-
-                std::cout << "wei: ";
-                show_data_nhwc_layout(weights_host_result);
-                std::cout << std::endl;
-
-                std::cout << "out  : ";
-                show_data_nhwc_layout(input);
-                std::cout << std::endl;
-
-                std::cout << "wei_device: ";
-                show_data_nhwc_layout(weights_device_result);
-                std::cout << std::endl;
-            }
-        }
-    }
-
-    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
-    return success;
-}
-
-} // namespace profiler
-} // namespace ck
--- a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iomanip>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
+
+namespace ck {
+namespace profiler {
+
+template <typename InDataType,
+          typename OutDataType,
+          typename ComputeDataType,
+          typename IndexDataType,
+          ck::ReduceTensorOp ReduceOpId,
+          bool PropagateNan,
+          bool OutputIndex>
+bool profile_pool2d_fwd_impl(int do_verification,
+                             int init_method,
+                             bool do_log,
+                             bool time_kernel,
+                             std::vector<index_t> in_length, // NCHW
+                             std::vector<index_t> window_spatial_lengths,
+                             std::vector<index_t> window_strides,
+                             std::vector<index_t> input_left_pads,
+                             std::vector<index_t> input_right_pads)
+{
+    constexpr index_t InOutRank  = 4;
+    constexpr index_t WindowRank = 2;
+
+    if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
+       window_strides.size() != WindowRank || input_left_pads.size() != WindowRank ||
+       input_right_pads.size() != WindowRank)
+        return false;
+
+    std::vector<index_t> out_length(InOutRank);
+
+    int N = in_length[0];
+    int C = in_length[1];
+
+    out_length[0] = N;
+    out_length[1] = C;
+
+    // Calculate Ho, Wo
+    for(int i = 2; i < InOutRank; ++i)
+    {
+        auto pad1           = input_left_pads[i - 2];
+        auto pad2           = input_right_pads[i - 2];
+        auto windows_size   = window_spatial_lengths[i - 2];
+        auto windows_stride = window_strides[i - 2];
+        out_length[i]       = (in_length[i] + pad1 + pad2 - windows_size) / windows_stride + 1;
+    }
+
+    int Hi = in_length[2];
+    int Wi = in_length[3];
+    int Ho = out_length[2];
+    int Wo = out_length[3];
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
+            using namespace ck::literals;
+            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
+        };
+
+    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
+    Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
+    Tensor<IndexDataType> out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
+
+    Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
+    Tensor<IndexDataType> out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
+
+    switch(init_method)
+    {
+    case 0: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
+    case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
+    default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) *
+                             out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
+    DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
+                                     out_indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
+
+    // add device normalization instances
+    using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                                 WindowRank,
+                                                                 InDataType,
+                                                                 OutDataType,
+                                                                 IndexDataType,
+                                                                 ReduceOpId,
+                                                                 OutputIndex>;
+
+    // get device op instances
+    const auto instance_ptrs =
+        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+
+    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
+
+    std::string best_instance_name;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    if(do_verification)
+    {
+        using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
+                                                                                  WindowRank,
+                                                                                  InDataType,
+                                                                                  OutDataType,
+                                                                                  ComputeDataType,
+                                                                                  IndexDataType,
+                                                                                  ReduceOpId,
+                                                                                  PropagateNan,
+                                                                                  OutputIndex>;
+
+        ReferenceInstance ref;
+        auto ref_argument = ref.MakeArgument(in_n_c_hi_wi,
+                                             out_n_c_ho_wo_host,
+                                             out_indices_n_c_ho_wo_host,
+                                             window_spatial_lengths,
+                                             window_strides,
+                                             input_left_pads,
+                                             input_right_pads);
+        auto ref_invoker  = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+    }
+
+    int num_kernel = 0;
+
+    for(auto& inst_ptr : instance_ptrs)
+    {
+        auto argument_ptr = inst_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            {C * Hi * Wi, 1, Wi * C, C},
+            {C * Ho * Wo, 1, Wo * C, C},
+            {C * Ho * Wo, 1, Wo * C, C},
+            window_strides,
+            input_left_pads,
+            input_right_pads,
+            {2, 3});
+
+        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            ++num_kernel;
+        }
+        else
+        {
+            if(time_kernel)
+            {
+                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
+                LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl;
+            }
+
+            continue;
+        }
+
+        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
+
+        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+        std::size_t num_bytes = in_n_c_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) +
+                                out_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType);
+
+        if constexpr(OutputIndex)
+            num_bytes += out_indices_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(IndexDataType);
+
+        float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+        if(time_kernel)
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                      << inst_ptr->GetTypeString() << std::endl;
+
+        if(avg_time < best_avg_time)
+        {
+            best_instance_name = inst_ptr->GetTypeString();
+            best_avg_time      = avg_time;
+            best_gb_per_sec    = gb_per_sec;
+        }
+
+        if(do_verification)
+        {
+            out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
+
+            bool pass = ck::utils::check_err(out_n_c_ho_wo_device.mData,
+                                             out_n_c_ho_wo_host.mData,
+                                             "Error: Incorrect results",
+                                             1e-3,
+                                             1e-3);
+
+            if constexpr(OutputIndex)
+            {
+                out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
+
+                pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device,
+                                                    out_indices_n_c_ho_wo_host);
+            }
+
+            if(do_log)
+            {
+                LogRangeAsType<float>(std::cout << "in_n_c_hi_wi  : ", in_n_c_hi_wi.mData, ",")
+                    << std::endl;
+                LogRangeAsType<float>(
+                    std::cout << "out_n_c_ho_wo_host  : ", out_n_c_ho_wo_host.mData, ",")
+                    << std::endl;
+                LogRangeAsType<float>(
+                    std::cout << "out_n_c_ho_wo_device  : ", out_n_c_ho_wo_device.mData, ",")
+                    << std::endl;
+
+                if constexpr(OutputIndex)
+                    LogRangeAsType<float>(std::cout << "out_indices_n_c_ho_wo_device  : ",
+                                          out_indices_n_c_ho_wo_device.mData,
+                                          ",")
+                        << std::endl;
+            }
+
+            if(!pass)
+            {
+                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
+                LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl;
+                return false;
+            }
+            else
+            {
+                if(time_kernel)
+                    std::cout << "pass" << std::endl;
+            }
+        }
+    }
+
+    if(time_kernel)
+    {
+        LogRange(std::cout << "length = ", in_length, ",") << std::endl;
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
+    }
+
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
+
+    return true;
+}
+
+} // namespace profiler
+} // namespace ck
--- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iomanip>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
+
+namespace ck {
+namespace profiler {
+
+// Profiles (and optionally verifies) every registered DevicePoolFwd instance for a
+// rank-5 input tensor with a rank-3 pooling window.
+//
+// Template parameters:
+//   InDataType / OutDataType - element types of the input and output tensors.
+//   ComputeDataType          - only forwarded to the host reference (ReferencePoolingFwd).
+//   IndexDataType            - element type of the index output tensor.
+//   ReduceOpId               - reduction selecting the pooling flavour; forwarded to both
+//                              the device op type and the host reference.
+//   PropagateNan             - only forwarded to the host reference.
+//   OutputIndex              - when true, the index tensor is counted in the transferred
+//                              bytes, copied back and compared against the reference.
+//
+// Parameters:
+//   do_verification        - non-zero: run the host reference once and compare each
+//                            supported instance against it.
+//   init_method            - 0 -> GeneratorTensor_1, 1 -> GeneratorTensor_2{-5, 5},
+//                            anything else -> GeneratorTensor_3{-0.5, 0.5}.
+//   do_log                 - dump input/output tensors (only inside the verification path).
+//   time_kernel            - forwarded to StreamConfig and enables the perf/log printing.
+//   in_length              - 5 lengths, indexed below as N, C, Di, Hi, Wi.
+//   window_spatial_lengths, window_strides, input_left_pads, input_right_pads
+//                          - 3 values each, one per spatial dimension.
+//
+// Returns false when an argument has the wrong number of entries, when an instance
+// fails verification (returns at the first failure), or when no instance supports the
+// problem; returns true otherwise.
+template <typename InDataType,
+          typename OutDataType,
+          typename ComputeDataType,
+          typename IndexDataType,
+          ck::ReduceTensorOp ReduceOpId,
+          bool PropagateNan,
+          bool OutputIndex>
+bool profile_pool3d_fwd_impl(int do_verification,
+                             int init_method,
+                             bool do_log,
+                             bool time_kernel,
+                             std::vector<index_t> in_length, // NCDHW
+                             std::vector<index_t> window_spatial_lengths,
+                             std::vector<index_t> window_strides,
+                             std::vector<index_t> input_left_pads,
+                             std::vector<index_t> input_right_pads)
+{
+    constexpr index_t InOutRank  = 5;
+    constexpr index_t WindowRank = 3;
+
+    // Reject malformed problem descriptions silently (no message is printed here).
+    if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
+       window_strides.size() != WindowRank || input_left_pads.size() != WindowRank ||
+       input_right_pads.size() != WindowRank)
+        return false;
+
+    std::vector<index_t> out_length(InOutRank);
+
+    int N = in_length[0];
+    int C = in_length[1];
+
+    // Pooling leaves the batch and channel lengths unchanged.
+    out_length[0] = N;
+    out_length[1] = C;
+
+    // Calculate Do, Ho, Wo: (in + left_pad + right_pad - window) / stride + 1, using
+    // integer division. NOTE(review): a zero stride or a window larger than the padded
+    // input is not guarded against here.
+    for(int i = 2; i < InOutRank; ++i)
+    {
+        auto pad1           = input_left_pads[i - 2];
+        auto pad2           = input_right_pads[i - 2];
+        auto windows_size   = window_spatial_lengths[i - 2];
+        auto windows_stride = window_strides[i - 2];
+        out_length[i]       = (in_length[i] + pad1 + pad2 - windows_size) / windows_stride + 1;
+    }
+
+    int Di = in_length[2];
+    int Hi = in_length[3];
+    int Wi = in_length[4];
+    int Do = out_length[2];
+    int Ho = out_length[3];
+    int Wo = out_length[4];
+
+    // Lengths are given in N, C, D, H, W order, but the strides give the channel
+    // dimension stride 1 and W stride C, i.e. the data is laid out as N, D, H, W, C.
+    auto f_host_tensor_descriptor =
+        [](std::size_t N_, std::size_t C_, std::size_t D, std::size_t H, std::size_t W) {
+            using namespace ck::literals;
+
+            return HostTensorDescriptor({N_, C_, D, H, W},
+                                        {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
+        };
+
+    // "_host" tensors receive the reference result, "_device" tensors receive the
+    // data copied back from the device buffers.
+    Tensor<InDataType> in_n_c_di_hi_wi(f_host_tensor_descriptor(N, C, Di, Hi, Wi));
+    Tensor<OutDataType> out_n_c_do_ho_wo_host(f_host_tensor_descriptor(N, C, Do, Ho, Wo));
+    Tensor<IndexDataType> out_indices_n_c_do_ho_wo_host(f_host_tensor_descriptor(N, C, Do, Ho, Wo));
+
+    Tensor<OutDataType> out_n_c_do_ho_wo_device(f_host_tensor_descriptor(N, C, Do, Ho, Wo));
+    Tensor<IndexDataType> out_indices_n_c_do_ho_wo_device(
+        f_host_tensor_descriptor(N, C, Do, Ho, Wo));
+
+    // Fill the input; the exact value distribution of each generator is defined in
+    // host_tensor_generator.hpp (not visible here).
+    switch(init_method)
+    {
+    case 0: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
+    case 1: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
+    default: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
+    }
+
+    // Device buffers are sized from the descriptors' element space size.
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_di_hi_wi.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) *
+                             out_n_c_do_ho_wo_device.mDesc.GetElementSpaceSize());
+    DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
+                                     out_indices_n_c_do_ho_wo_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in_n_c_di_hi_wi.mData.data());
+
+    // device pooling operation interface whose instances are profiled
+    using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                                 WindowRank,
+                                                                 InDataType,
+                                                                 OutDataType,
+                                                                 IndexDataType,
+                                                                 ReduceOpId,
+                                                                 OutputIndex>;
+
+    // get device op instances
+    const auto instance_ptrs =
+        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+
+    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
+
+    // Best-so-far bookkeeping; "best" means the smallest avg_time.
+    std::string best_instance_name;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // The host reference is computed once, before the instance loop, and reused for
+    // every instance comparison.
+    if(do_verification)
+    {
+        using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
+                                                                                  WindowRank,
+                                                                                  InDataType,
+                                                                                  OutDataType,
+                                                                                  ComputeDataType,
+                                                                                  IndexDataType,
+                                                                                  ReduceOpId,
+                                                                                  PropagateNan,
+                                                                                  OutputIndex>;
+
+        ReferenceInstance ref;
+        auto ref_argument = ref.MakeArgument(in_n_c_di_hi_wi,
+                                             out_n_c_do_ho_wo_host,
+                                             out_indices_n_c_do_ho_wo_host,
+                                             window_spatial_lengths,
+                                             window_strides,
+                                             input_left_pads,
+                                             input_right_pads);
+        auto ref_invoker  = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+    }
+
+    // Number of instances that accepted the problem.
+    int num_kernel = 0;
+
+    for(auto& inst_ptr : instance_ptrs)
+    {
+        // The three stride lists repeat the strides of the host descriptors (input,
+        // output, index output). NOTE(review): the trailing {2, 3, 4} presumably names
+        // the pooled dimensions - confirm against DevicePoolFwd::MakeArgumentPointer.
+        auto argument_ptr = inst_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C},
+            {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
+            {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
+            window_strides,
+            input_left_pads,
+            input_right_pads,
+            {2, 3, 4});
+
+        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            ++num_kernel;
+        }
+        else
+        {
+            // Unsupported instances are skipped; the skip is only reported when timing.
+            if(time_kernel)
+            {
+                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
+                LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl;
+            }
+
+            continue;
+        }
+
+        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
+
+        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+        // Bytes moved: the whole input plus the whole output (plus indices if produced).
+        std::size_t num_bytes = in_n_c_di_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) +
+                                out_n_c_do_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType);
+
+        if constexpr(OutputIndex)
+            num_bytes +=
+                out_indices_n_c_do_ho_wo_host.mDesc.GetElementSize() * sizeof(IndexDataType);
+
+        // avg_time is printed as milliseconds below, so bytes / 1e6 / ms yields GB/s.
+        float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+        if(time_kernel)
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                      << inst_ptr->GetTypeString() << std::endl;
+
+        if(avg_time < best_avg_time)
+        {
+            best_instance_name = inst_ptr->GetTypeString();
+            best_avg_time      = avg_time;
+            best_gb_per_sec    = gb_per_sec;
+        }
+
+        if(do_verification)
+        {
+            out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());
+
+            // Compare values against the reference; the two 1e-3 arguments are the
+            // tolerances handed to check_err.
+            bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
+                                             out_n_c_do_ho_wo_host.mData,
+                                             "Error: Incorrect results",
+                                             1e-3,
+                                             1e-3);
+
+            if constexpr(OutputIndex)
+            {
+                out_indices_device_buf.FromDevice(out_indices_n_c_do_ho_wo_device.mData.data());
+
+                pass = pass && ck::utils::check_err(out_indices_n_c_do_ho_wo_device,
+                                                    out_indices_n_c_do_ho_wo_host);
+            }
+
+            // Tensor dumps happen only when verification is enabled, and before the
+            // pass/fail decision so that a failing run is logged too.
+            if(do_log)
+            {
+                LogRangeAsType<float>(
+                    std::cout << "in_n_c_di_hi_wi  : ", in_n_c_di_hi_wi.mData, ",")
+                    << std::endl;
+                LogRangeAsType<float>(
+                    std::cout << "out_n_c_do_ho_wo_host  : ", out_n_c_do_ho_wo_host.mData, ",")
+                    << std::endl;
+                LogRangeAsType<float>(
+                    std::cout << "out_n_c_do_ho_wo_device  : ", out_n_c_do_ho_wo_device.mData, ",")
+                    << std::endl;
+
+                if constexpr(OutputIndex)
+                    LogRangeAsType<float>(std::cout << "out_indices_n_c_do_ho_wo_device  : ",
+                                          out_indices_n_c_do_ho_wo_device.mData,
+                                          ",")
+                        << std::endl;
+            }
+
+            // The first failing instance aborts the whole profile run.
+            if(!pass)
+            {
+                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
+                LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl;
+                return false;
+            }
+            else
+            {
+                if(time_kernel)
+                    std::cout << "pass" << std::endl;
+            }
+        }
+    }
+
+    // Summary of the fastest instance (printed even if no instance was supported).
+    if(time_kernel)
+    {
+        LogRange(std::cout << "length = ", in_length, ",") << std::endl;
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
+    }
+
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
+
+    return true;
+}
+
+} // namespace profiler
+} // namespace ck
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -25,6 +25,8 @@ set(PROFILER_SOURCES
    profile_reduce.cpp
    profile_groupnorm.cpp
    profile_layernorm.cpp
+    profile_avg_pool2d_fwd.cpp
+    profile_max_pool3d_fwd.cpp
    profile_softmax.cpp
    profile_batchnorm_fwd.cpp
    profile_batchnorm_bwd.cpp
@@ -74,4 +76,6 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_fwd_instance)
+
 rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
--- a/profiler/src/profile_avg_pool2d_fwd.cpp
+++ b/profiler/src/profile_avg_pool2d_fwd.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <vector>
+#include <unordered_map>
+
+#include "profiler/data_type_enum.hpp"
+#include "profiler/profile_pool2d_fwd_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+using ck::index_t;
+
+// Collects the integer values that follow each "--<name>" long option on the
+// command line. Results are stored per option name in long_opts.
+struct avgPoolFwdArgParser
+{
+    // Option name -> values parsed so far (empty until the option is seen).
+    std::unordered_map<std::string, std::vector<int>> long_opts = {
+        {"length", {}}, {"wsize", {}}, {"wstride", {}}, {"pad1", {}}, {"pad2", {}}};
+
+    // If argv[i] is exactly "--<key>", appends every following argument up to (not
+    // including) the next one starting with '-' to long_opts[key] and returns true.
+    // Returns false, leaving long_opts untouched, when argv[i] is a different token.
+    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
+    {
+        const std::string flag = std::string("--") + key;
+        if(!(flag == argv[i]))
+        {
+            return false;
+        }
+
+        // Locate the end of the value run first, so the conversion loop below sees
+        // a fixed [first_value, past_last_value) range.
+        const int first_value = i + 1;
+        int past_last_value   = first_value;
+        while(past_last_value < argc && argv[past_last_value][0] != '-')
+        {
+            ++past_last_value;
+        }
+
+        for(int value_idx = first_value; value_idx < past_last_value; ++value_idx)
+        {
+            long_opts[key].push_back(std::stoi(argv[value_idx]));
+        }
+
+        return true;
+    }
+
+    // Scans argv[1..argc) once per known option and parses its first occurrence;
+    // later occurrences of the same option are ignored.
+    void operator()(int argc, char* argv[])
+    {
+        for(auto& option : long_opts)
+        {
+            int arg_idx = 1;
+            while(arg_idx < argc && !parse_opt(argc, argv, option.first, arg_idx))
+            {
+                ++arg_idx;
+            }
+        }
+    }
+};
+
+// Prints the command-line usage of the avg_pool2d_fwd profiler to stdout.
+//
+// The text mirrors what profile_avg_pool2d_fwd accepts: positional arguments
+// followed by the long options, for a total argument count of 25. The sixth
+// positional argument is required by that count but its value is never read.
+void print_help_avg_pool2d_fwd()
+{
+    std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
+              << "arg2: verification (0: no; 1: yes)\n"
+              << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
+              << "arg4: print tensor value (0: no; 1: yes)\n"
+              << "arg5: time kernel (0=no, 1=yes)\n"
+              << "arg6: reserved (must be present; value is ignored)\n"
+              << "--length: input tensor length for NCHW (e.g, --length 2 32 30 30) \n"
+              << "--wsize: window size for YX (e.g, --wsize 2 2) \n"
+              << "--wstride: window stride for HW (e.g, --wstride 2 2) \n"
+              << "--pad1: left side of padding in HW (e.g, --pad1 1 1) \n"
+              << "--pad2: right side of padding in HW (e.g, --pad2 1 1) \n"
+              << "eg: ckProfiler avg_pool2d_fwd 0 1 2 0 1 0 --length 2 32 30 30 --wsize 2 2 "
+                 "--wstride 2 2 --pad1 1 1 --pad2 1 1"
+              << std::endl;
+}
+
+// Command-line entry point of the "avg_pool2d_fwd" profiler operation.
+//
+// With argc == 2 the built-in default problem is profiled; with argc == 25 the
+// settings are read from argv[2..6] and the "--" long options. Any other argument
+// count prints the usage text and returns 0.
+//
+// Returns 0 when the profile run succeeded and 1 when profile_pool2d_fwd_impl
+// reported a failure (bad argument ranks, failed verification, or no applicable
+// kernel). Throws std::runtime_error for data types other than fp16 / fp32.
+int profile_avg_pool2d_fwd(int argc, char* argv[])
+{
+    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
+    bool do_verification       = true;
+    int init_method            = 0;
+    bool do_log                = false;
+    bool time_kernel           = true;
+
+    // Default problem, used when no options are given.
+    std::vector<index_t> in_length = {2, 32, 30, 30};
+    std::vector<index_t> wsize     = {2, 2};
+    std::vector<index_t> wstride   = {2, 2};
+    std::vector<index_t> pad1      = {1, 1};
+    std::vector<index_t> pad2      = {1, 1};
+
+    if(argc != 2 && argc != 25)
+    {
+        print_help_avg_pool2d_fwd();
+        return 0;
+    }
+    else if(argc == 25)
+    {
+        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
+        do_verification = std::stoi(argv[3]);
+        init_method     = std::stoi(argv[4]);
+        do_log          = std::stoi(argv[5]);
+        time_kernel     = std::stoi(argv[6]);
+
+        // parse the long options; an option that is missing yields an empty vector,
+        // which the implementation rejects through its rank check
+        avgPoolFwdArgParser arg_parser;
+        arg_parser(argc, argv);
+        in_length = arg_parser.long_opts["length"];
+        wsize     = arg_parser.long_opts["wsize"];
+        wstride   = arg_parser.long_opts["wstride"];
+        pad1      = arg_parser.long_opts["pad1"];
+        pad2      = arg_parser.long_opts["pad2"];
+    }
+
+    using F16                 = ck::half_t;
+    using F32                 = float;
+    using I32                 = int32_t;
+    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+
+    // Result of the profile run; propagated to the caller as the exit status so that
+    // a failed verification is not reported as success.
+    bool pass = false;
+
+    if(data_type == ck::DataTypeEnum::Half)
+    {
+        pass = ck::profiler::profile_pool2d_fwd_impl<F16, F16, F32, I32, ReduceOpId, false, false>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            in_length,
+            wsize,
+            wstride,
+            pad1,
+            pad2);
+    }
+    else if(data_type == ck::DataTypeEnum::Float)
+    {
+        pass = ck::profiler::profile_pool2d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, false>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            in_length,
+            wsize,
+            wstride,
+            pad1,
+            pad2);
+    }
+    else
+    {
+        throw std::runtime_error("not implemented yet");
+    }
+
+    return pass ? 0 : 1;
+}
+
+// Hooks profile_avg_pool2d_fwd into the profiler under the name "avg_pool2d_fwd"
+// (the name used on the ckProfiler command line in the usage example).
+// NOTE(review): the second string looks like a human-readable description - confirm
+// against profiler_operation_registry.hpp.
+REGISTER_PROFILER_OPERATION("avg_pool2d_fwd", "avg_pool2d fwd", profile_avg_pool2d_fwd);
--- a/profiler/src/profile_groupnorm.cpp
+++ b/profiler/src/profile_groupnorm.cpp
@@ -64,7 +64,7 @@ int profile_groupnorm(int argc, char* argv[])
    ck::DataTypeEnum data_type  = ck::DataTypeEnum::Half;
    bool do_verification        = false;
    int init_method             = 0;
-    bool do_log                 = 0;
+    bool do_log                 = false;
    bool time_kernel            = 1;
    std::vector<index_t> length = {64, 16, 16, 32, 40};