Unverified Commit e8d2887c authored by jakpiase, committed by GitHub

Rewrite pool2d fwd (#1462)



* added pool2d fwd

* add tests

* add reviewers changes

* Revert "Merge remote-tracking branch 'origin/develop' into jakpiase/pool2d_fwd_new"

This reverts commit 6b2ba7ff8960b0a6ddbe30d8dac53eeb55a8597e, reversing
changes made to 22c82bea0caf3e0f29399100c1bb67b8003fc042.

* Revert "add reviewers changes"

This reverts commit 22c82bea0caf3e0f29399100c1bb67b8003fc042.

* added reviewers comments

* revert some old files

* add reviewers requests

---------
Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
parent 2a261afc
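Context for the diff below: the previous pool2d forward simply wrapped the 3D NDHWC operator; this rewrite maps 2D pooling directly onto a 2D (M, K) threadwise reduction, where M = N * Ho * Wo * C and K = Y * X. As a rough usage sketch (not part of the diff; it mirrors the profiler code added later in this PR, and in_dev/out_dev/indices_dev are hypothetical device pointers):

// Hedged sketch: f16 NHWC max pool that returns indices, using one of the
// instance configurations added in this PR (BlockSize 256, vector size 1).
auto pool = DevicePool2dFwd_NHWC_NHWC<F16, F16, I32, F32,
                                      ck::ReduceTensorOp::MAX, true, 256, 256, 1, 1, 1, 1>{};
auto arg = pool.MakeArgumentPointer(in_dev, out_dev, indices_dev,
                                    {2, 32, 30, 30},                // input NCHW lengths
                                    {2, 2},                         // window YX lengths
                                    {2, 32, 16, 16},                // output NCHW lengths
                                    {32 * 30 * 30, 1, 30 * 32, 32}, // input strides (NHWC memory)
                                    {32 * 16 * 16, 1, 16 * 32, 32}, // output strides
                                    {32 * 16 * 16, 1, 16 * 32, 32}, // indices strides
                                    {2, 2}, {1, 1},                 // window strides, dilations
                                    {1, 1}, {1, 1},                 // left/right HW pads
                                    {2, 3});                        // pooling dims
if(pool.IsSupportedArgument(arg.get()))
    pool.MakeInvokerPointer()->Run(arg.get(), StreamConfig{nullptr, true});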
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
...
@@ -16,95 +27,359 @@ template <typename InDataType,
ck::ReduceTensorOp ReduceOpId,
bool OutputIndex,
ck::index_t BlockSize,
ck::index_t MThreadClusterSize,
ck::index_t KThreadClusterSize,
ck::index_t MThreadSliceSize,
ck::index_t KThreadSliceSize,
ck::index_t InSrcOutDstVectorSize>
struct DevicePool2dFwd_NHWC_NHWC : public DevicePoolFwd<4,
                                                        2,
                                                        InDataType,
                                                        OutDataType,
                                                        IndexDataType,
                                                        tensor_layout::convolution::NHWC,
                                                        tensor_layout::convolution::NHWC,
                                                        ReduceOpId,
                                                        OutputIndex>
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr index_t InOutRank = 4;
static constexpr index_t WindowRank = 2;
using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
using InElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
static constexpr ck::index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr ck::index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
static auto MakeABGridDescriptor_A_M_K_B_M(std::vector<ck::index_t> input_nchw_lengths,
std::vector<ck::index_t> output_nchw_lengths,
std::vector<ck::index_t> input_nchw_stride,
std::vector<ck::index_t> output_nchw_stride,
std::vector<ck::index_t> window_spatial_yx_lengths,
std::vector<ck::index_t> window_yx_strides,
std::vector<ck::index_t> window_yx_dilations,
std::vector<ck::index_t> input_left_hw_pads,
std::vector<ck::index_t> input_right_hw_pads)
{
const index_t N = input_nchw_lengths[0];
const index_t C = input_nchw_lengths[1];
const index_t Hi = input_nchw_lengths[2];
const index_t Wi = input_nchw_lengths[3];
const index_t Ho = output_nchw_lengths[2];
const index_t Wo = output_nchw_lengths[3];
const index_t Y = window_spatial_yx_lengths[0];
const index_t X = window_spatial_yx_lengths[1];
const index_t WindowStrideH = window_yx_strides[0];
const index_t WindowStrideW = window_yx_strides[1];
const index_t WindowDilationH = window_yx_dilations[0];
const index_t WindowDilationW = window_yx_dilations[1];
const index_t InLeftPadH = input_left_hw_pads[0];
const index_t InLeftPadW = input_left_hw_pads[1];
const index_t InRightPadH = input_right_hw_pads[0];
const index_t InRightPadW = input_right_hw_pads[1];
const index_t MRaw = N * Ho * Wo * C;
const index_t MPad = math::integer_least_multiple(MRaw, M_BlockTileSize) - MRaw;
const index_t KRaw = Y * X;
const index_t KPad = math::integer_least_multiple(KRaw, K_BlockTileSize) - KRaw;
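// Worked example (hypothetical sizes): N = 2, C = 32, Ho = Wo = 15, Y = X = 3
// gives MRaw = 2 * 15 * 15 * 32 = 14400 and KRaw = 9; with M_BlockTileSize = 256
// and K_BlockTileSize = 2 the descriptors get MPad = 14592 - 14400 = 192 and
// KPad = 10 - 9 = 1, so every block tile is fully populated after padding.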
// A[ReduceM, ReduceK]
const index_t Ni_stride = input_nchw_stride[0];
const index_t Ci_stride = input_nchw_stride[1];
const index_t Hi_stride = input_nchw_stride[2];
const index_t Wi_stride = input_nchw_stride[3];
const auto in_grid_desc_n_hi_wi_c = make_naive_tensor_descriptor(
make_tuple(N, Hi, Wi, C), make_tuple(Ni_stride, Hi_stride, Wi_stride, Ci_stride));
const auto in_grid_desc_n_hip_wip_c = transform_tensor_descriptor(
in_grid_desc_n_hi_wi_c,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_grid_desc_n_y_ho_x_wo_c = transform_tensor_descriptor(
in_grid_desc_n_hip_wip_c,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(Y, Ho), make_tuple(WindowDilationH, WindowStrideH)),
make_embed_transform(make_tuple(X, Wo), make_tuple(WindowDilationW, WindowStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
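// The two embed transforms perform the im2col-style window gather: for output
// position (ho, wo) and window offset (y, x), the padded input coordinates are
// hip = y * WindowDilationH + ho * WindowStrideH and
// wip = x * WindowDilationW + wo * WindowStrideW.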
const auto in_grid_desc_reducemraw_reducekraw =
transform_tensor_descriptor(in_grid_desc_n_y_ho_x_wo_c,
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo, C)),
make_merge_transform(make_tuple(Y, X))),
make_tuple(Sequence<0, 2, 4, 5>{}, Sequence<1, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto in_grid_desc_reducem_reducek = transform_tensor_descriptor(
in_grid_desc_reducemraw_reducekraw,
make_tuple(make_right_pad_transform(MRaw, MPad), make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// B[ReduceM]
const index_t No_stride = output_nchw_stride[0];
const index_t Co_stride = output_nchw_stride[1];
const index_t Ho_stride = output_nchw_stride[2];
const index_t Wo_stride = output_nchw_stride[3];
const auto out_grid_desc_n_ho_wo_c = make_naive_tensor_descriptor(
make_tuple(N, Ho, Wo, C), make_tuple(No_stride, Ho_stride, Wo_stride, Co_stride));
const auto out_grid_desc_reducemraw =
transform_tensor_descriptor(out_grid_desc_n_ho_wo_c,
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo, C))),
make_tuple(Sequence<0, 1, 2, 3>{}),
make_tuple(Sequence<0>{}));
const auto out_grid_desc_reducem =
transform_tensor_descriptor(out_grid_desc_reducemraw,
make_tuple(make_right_pad_transform(MRaw, MPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return make_tuple(in_grid_desc_reducem_reducek, out_grid_desc_reducem);
}
using ABGridDescs =
decltype(MakeABGridDescriptor_A_M_K_B_M({}, {}, {}, {}, {}, {}, {}, {}, {}));
using AGridDesc_M_K = remove_cvref_t<decltype(ABGridDescs{}[I0])>;
using BGridDesc_M = remove_cvref_t<decltype(ABGridDescs{}[I1])>;
struct Argument : public BaseArgument
{
Argument(const InDataType* p_in_dev,
OutDataType* p_out_dev,
IndexDataType* p_out_indices_dev,
std::vector<ck::index_t>& input_nchw_lengths,
std::vector<ck::index_t>& output_nchw_lengths,
std::vector<ck::index_t>& input_nchw_stride,
std::vector<ck::index_t>& output_nchw_stride,
std::vector<ck::index_t>&, // indices_nchw_stride
std::vector<ck::index_t>& window_spatial_yx_lengths,
std::vector<ck::index_t>& window_yx_strides,
std::vector<ck::index_t>& window_yx_dilations,
std::vector<ck::index_t>& input_left_hw_pads,
std::vector<ck::index_t>& input_right_hw_pads)
: p_in_dev_{p_in_dev},
p_out_dev_{p_out_dev},
p_out_indices_dev_{p_out_indices_dev},
a_grid_desc_m_k_{},
b_grid_desc_m_{},
input_nchw_lengths_{input_nchw_lengths},
output_nchw_lengths_{output_nchw_lengths},
input_nchw_stride_{input_nchw_stride},
output_nchw_stride_{output_nchw_stride}
{
const auto descs = MakeABGridDescriptor_A_M_K_B_M(input_nchw_lengths,
output_nchw_lengths,
input_nchw_stride,
output_nchw_stride,
window_spatial_yx_lengths,
window_yx_strides,
window_yx_dilations,
input_left_hw_pads,
input_right_hw_pads);
a_grid_desc_m_k_ = descs[I0];
b_grid_desc_m_ = descs[I1];
int32_t reduceLength = window_spatial_yx_lengths[0] * window_spatial_yx_lengths[1];
std::tie(in_element_op_, acc_element_op_) =
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(reduceLength);
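// reduceLength = Y * X is baked into the elementwise ops: for AVG pooling the
// accumulated sum is divided by the window size, while for MAX it is a pass-through.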
}
const InDataType* p_in_dev_;
OutDataType* p_out_dev_;
IndexDataType* p_out_indices_dev_;
AGridDesc_M_K a_grid_desc_m_k_;
BGridDesc_M b_grid_desc_m_;
InElementwiseOperation in_element_op_;
AccElementwiseOperation acc_element_op_;
// for checking vector load/store
std::vector<ck::index_t> input_nchw_lengths_;
std::vector<ck::index_t> output_nchw_lengths_;
std::vector<ck::index_t> input_nchw_stride_;
std::vector<ck::index_t> output_nchw_stride_;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
// For NHWC, C is the fastest-varying dimension and is not reduced,
// so it belongs to the M dimension of the reduction kernel.
static constexpr index_t InSrcOutDstVectorDim = 0; // 0: M, 1: K
using gridwise_reduce =
GridwiseReduction_mk_to_m_threadwise<InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
AGridDesc_M_K,
BGridDesc_M,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
InMemoryDataOperationEnum::Set,
false, // propagate_nan
BlockSize,
MThreadSliceSize,
KThreadSliceSize,
InSrcOutDstVectorDim,
InSrcOutDstVectorSize,
InSrcOutDstVectorSize>;
const auto kernel =
kernel_reduce_threadwise<gridwise_reduce,
                         OutputIndex,
                         true,  // pooling needs to return the global index
                         false, // no index input
                         InDataType,
                         OutDataType,
                         ComputeDataType,
                         IndexDataType,
                         AGridDesc_M_K,
                         BGridDesc_M,
                         InElementwiseOperation,
                         AccElementwiseOperation>;
ck::index_t M = arg.a_grid_desc_m_k_.GetLength(I0);
const index_t grid_size = (M / M_BlockTileSize);
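// M was right-padded to a multiple of M_BlockTileSize above, so this division is exact.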
return launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.a_grid_desc_m_k_,
arg.b_grid_desc_m_,
arg.in_element_op_,
arg.acc_element_op_,
float(1),
arg.p_in_dev_,
nullptr,
float(0),
arg.p_out_dev_,
arg.p_out_indices_dev_);
}
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
// C should be fastest dimension
if(pArg->input_nchw_stride_[1] != 1)
return false;
for(int i = 0; i < InOutRank; ++i)
{
if(pArg->input_nchw_stride_[i] == 1 &&
pArg->input_nchw_lengths_[i] % InSrcOutDstVectorSize != 0)
return false;
if(pArg->output_nchw_stride_[i] == 1 &&
pArg->output_nchw_lengths_[i] % InSrcOutDstVectorSize != 0)
return false;
}
return true;
}
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in_dev,
                    void* p_out_dev,
                    void* p_out_indices_dev,
                    std::vector<ck::index_t> input_nchw_lengths,
                    std::vector<ck::index_t> window_yx_lengths,
                    std::vector<ck::index_t> output_nchw_lengths,
                    std::vector<ck::index_t> input_nchw_stride,
                    std::vector<ck::index_t> output_nchw_stride,
                    std::vector<ck::index_t> indices_nchw_stride,
                    std::vector<ck::index_t> window_yx_strides,
                    std::vector<ck::index_t> window_yx_dilations,
                    std::vector<ck::index_t> input_left_hw_pads,
                    std::vector<ck::index_t> input_right_hw_pads,
                    std::vector<ck::index_t> pooling_dims) override
{
if(input_nchw_lengths.size() != InOutRank || window_yx_lengths.size() != WindowRank ||
   output_nchw_lengths.size() != InOutRank || window_yx_strides.size() != WindowRank ||
   window_yx_dilations.size() != WindowRank || input_left_hw_pads.size() != WindowRank ||
   input_right_hw_pads.size() != WindowRank)
    throw std::runtime_error("dimension is incorrect");
if(pooling_dims != std::vector<ck::index_t>{2, 3})
    throw std::runtime_error("pooling_dims only supports {2, 3} in pool2d so far");
if(output_nchw_stride != indices_nchw_stride)
    throw std::runtime_error("output_nchw_stride must equal indices_nchw_stride for now");
return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_dev),
                                  static_cast<OutDataType*>(p_out_dev),
                                  static_cast<IndexDataType*>(p_out_indices_dev),
                                  input_nchw_lengths,
                                  output_nchw_lengths,
                                  input_nchw_stride,
                                  output_nchw_stride,
                                  indices_nchw_stride,
                                  window_yx_lengths,
                                  window_yx_strides,
                                  window_yx_dilations,
                                  input_left_hw_pads,
                                  input_right_hw_pads);
}
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DevicePool2dFwd_NHWC_NHWC<" << BlockSize << ",";
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
str << "InSrcOutDstVectorSize_" << InSrcOutDstVectorSize << ">";
// clang-format on
return str.str();
}
};
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr auto InOutRank = 4;
static constexpr auto WindowRank = 2;
static constexpr auto MaxOp = ck::ReduceTensorOp::MAX;
static constexpr auto AvgOp = ck::ReduceTensorOp::AVG;
#ifdef CK_ENABLE_FP16
// FP16
void add_device_pool2d_fwd_nhwc_f16_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F16, F16, I32, NHWC, NHWC, MaxOp, false>>>&);
void add_device_pool2d_fwd_nhwc_f16_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F16, F16, I32, NHWC, NHWC, AvgOp, false>>>&);
// FP16 - return index
void add_device_pool2d_fwd_nhwc_index_f16_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F16, F16, I32, NHWC, NHWC, MaxOp, true>>>&);
#endif
#ifdef CK_ENABLE_BF16
// BF16
void add_device_pool2d_fwd_nhwc_bf16_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, BF16, BF16, I32, NHWC, NHWC, MaxOp, false>>>&);
void add_device_pool2d_fwd_nhwc_bf16_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, BF16, BF16, I32, NHWC, NHWC, AvgOp, false>>>&);
// BF16 - return index
void add_device_pool2d_fwd_nhwc_index_bf16_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, BF16, BF16, I32, NHWC, NHWC, MaxOp, true>>>&);
#endif
#ifdef CK_ENABLE_FP32
// FP32
void add_device_pool2d_fwd_nhwc_f32_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, NHWC, NHWC, MaxOp, false>>>&);
void add_device_pool2d_fwd_nhwc_f32_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, NHWC, NHWC, AvgOp, false>>>&);
// FP32 - return index
void add_device_pool2d_fwd_nhwc_index_f32_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, NHWC, NHWC, MaxOp, true>>>&);
#endif
template <typename InDataType,
typename OutDataType,
typename IndexDataType,
typename InLayout,
typename OutLayout,
ck::ReduceTensorOp ReduceOpId,
bool OutputIndex>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFwd<InOutRank,
WindowRank,
InDataType,
OutDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
OutputIndex>>
{
using DeviceOp = DevicePoolFwd<InOutRank,
WindowRank,
InDataType,
OutDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
OutputIndex>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<InLayout, NHWC> && is_same_v<OutLayout, NHWC>)
{
#ifdef CK_ENABLE_FP16
if constexpr(is_same_v<InDataType, F16> && is_same_v<OutDataType, F16> &&
is_same_v<IndexDataType, I32>)
{
if constexpr(OutputIndex && ReduceOpId == MaxOp)
{
add_device_pool2d_fwd_nhwc_index_f16_instances(op_ptrs);
}
else
{
add_device_pool2d_fwd_nhwc_f16_instances(op_ptrs);
}
}
#endif
#ifdef CK_ENABLE_BF16
else if constexpr(is_same_v<InDataType, BF16> && is_same_v<OutDataType, BF16> &&
is_same_v<IndexDataType, I32>)
{
if constexpr(OutputIndex && ReduceOpId == MaxOp)
{
add_device_pool2d_fwd_nhwc_index_bf16_instances(op_ptrs);
}
else
{
add_device_pool2d_fwd_nhwc_bf16_instances(op_ptrs);
}
}
#endif
#ifdef CK_ENABLE_FP32
else if constexpr(is_same_v<InDataType, F32> && is_same_v<OutDataType, F32> &&
is_same_v<IndexDataType, I32>)
{
if constexpr(OutputIndex && ReduceOpId == MaxOp)
{
add_device_pool2d_fwd_nhwc_index_f32_instances(op_ptrs);
}
else
{
add_device_pool2d_fwd_nhwc_f32_instances(op_ptrs);
}
}
#endif
}
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
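For reference, a short sketch of how these factory declarations are consumed; the profiler added later in this PR does exactly this (the f16/MAX/no-index combination is just one example):

using DeviceOp = ck::tensor_operation::device::
    DevicePoolFwd<4, 2, F16, F16, I32, NHWC, NHWC, ck::ReduceTensorOp::MAX, false>;
const auto op_ptrs = ck::tensor_operation::device::instance::
    DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
for(const auto& op : op_ptrs)
    std::cout << op->GetTypeString() << std::endl;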
set(DEVICE_POOL2D_FWD_INSTANCES)
list(APPEND DEVICE_POOL2D_FWD_INSTANCES device_avg_pool2d_fwd_nhwc_f16_instance.cpp
device_max_pool2d_fwd_nhwc_f16_instance.cpp
device_avg_pool2d_fwd_nhwc_f32_instance.cpp
device_max_pool2d_fwd_nhwc_f32_instance.cpp
device_avg_pool2d_fwd_nhwc_bf16_instance.cpp
device_max_pool2d_fwd_nhwc_bf16_instance.cpp)
add_instance_library(device_pool2d_fwd_instance ${DEVICE_POOL2D_FWD_INSTANCES})
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool2d_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
void add_device_pool2d_fwd_nhwc_bf16_instances(
std::vector<
std::unique_ptr<DevicePoolFwd<4, 2, BF16, BF16, I32, NHWC, NHWC, ReduceOpId, false>>>&
instances)
{
add_device_operation_instances(
instances, device_pool2d_fwd_nhwc_instances<BF16, BF16, I32, F32, ReduceOpId, false>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool2d_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
void add_device_pool2d_fwd_nhwc_f16_instances(
std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, NHWC, NHWC, ReduceOpId, false>>>&
instances)
{
add_device_operation_instances(
instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, false>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool2d_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
void add_device_pool2d_fwd_nhwc_f32_instances(
std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, NHWC, NHWC, ReduceOpId, false>>>&
instances)
{
add_device_operation_instances(
instances, device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool2d_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
void add_device_pool2d_fwd_nhwc_bf16_instances(
std::vector<
std::unique_ptr<DevicePoolFwd<4, 2, BF16, BF16, I32, NHWC, NHWC, ReduceOpId, false>>>&
instances)
{
add_device_operation_instances(
instances, device_pool2d_fwd_nhwc_instances<BF16, BF16, I32, F32, ReduceOpId, false>{});
}
void add_device_pool2d_fwd_nhwc_index_bf16_instances(
std::vector<
std::unique_ptr<DevicePoolFwd<4, 2, BF16, BF16, I32, NHWC, NHWC, ReduceOpId, true>>>&
instances)
{
add_device_operation_instances(
instances, device_pool2d_fwd_nhwc_instances<BF16, BF16, I32, F32, ReduceOpId, true>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool2d_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
void add_device_pool2d_fwd_nhwc_f16_instances(
std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, NHWC, NHWC, ReduceOpId, false>>>&
instances)
{
add_device_operation_instances(
instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, false>{});
}
void add_device_pool2d_fwd_nhwc_index_f16_instances(
std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, NHWC, NHWC, ReduceOpId, true>>>&
instances)
{
add_device_operation_instances(
instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, true>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "pool2d_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
void add_device_pool2d_fwd_nhwc_f32_instances(
std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, NHWC, NHWC, ReduceOpId, false>>>&
instances)
{
add_device_operation_instances(
instances, device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>{});
}
void add_device_pool2d_fwd_nhwc_index_f32_instances(
std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, NHWC, NHWC, ReduceOpId, true>>>&
instances)
{
add_device_operation_instances(
instances, device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, true>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using I32 = int32_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using NHWC = ck::tensor_layout::convolution::NHWC;
template <typename InDataType,
typename OutDataType,
typename IndexDataType,
typename ComputeDataType,
ReduceTensorOp ReduceOpId,
bool OutputIndex>
using device_pool2d_fwd_nhwc_instances =
// clang-format off
std::tuple <
DevicePool2dFwd_NHWC_NHWC<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 1, 1, 1>,
DevicePool2dFwd_NHWC_NHWC<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 2, 1, 2>,
DevicePool2dFwd_NHWC_NHWC<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 4, 1, 4>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
namespace ck {
namespace profiler {
template <typename InDataType,
typename OutDataType,
typename ComputeDataType,
typename IndexDataType,
typename InLayout,
typename OutLayout,
ck::ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool OutputIndex>
bool profile_pool2d_fwd_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> in_length, // NCHW
std::vector<index_t> window_spatial_lengths,
std::vector<index_t> window_strides,
std::vector<index_t> window_dilations,
std::vector<index_t> input_left_pads,
std::vector<index_t> input_right_pads)
{
constexpr index_t InOutRank = 4;
constexpr index_t WindowRank = 2;
if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
window_strides.size() != WindowRank || window_dilations.size() != WindowRank ||
input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank)
return false;
std::vector<index_t> out_length(InOutRank);
int N = in_length[0];
int C = in_length[1];
out_length[0] = N;
out_length[1] = C;
// Calculate Ho, Wo
for(int i = 2; i < InOutRank; ++i)
{
auto pad1 = input_left_pads[i - 2];
auto pad2 = input_right_pads[i - 2];
auto windows_size = window_spatial_lengths[i - 2];
auto windows_stride = window_strides[i - 2];
auto windows_dilation = window_dilations[i - 2];
auto eff = (windows_size - 1) * windows_dilation + 1;
out_length[i] = (in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
}
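// e.g. Hi = 30 with pads 1 + 1, window 2, dilation 1 (eff = 2), stride 2:
// Ho = (30 + 2 - 2) / 2 + 1 = 16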
int Hi = in_length[2];
int Wi = in_length[3];
int Ho = out_length[2];
int Wo = out_length[3];
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
using namespace ck::literals;
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
};
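// Lengths are listed in NCHW order while the strides describe NHWC memory:
// {C*H*W, 1, W*C, C} makes C the contiguous (fastest-varying) dimension.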
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
Tensor<IndexDataType> out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
Tensor<IndexDataType> out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
switch(init_method)
{
case 0: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
out_indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
// select the device pool2d fwd operation type
using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
WindowRank,
InDataType,
OutDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
OutputIndex>;
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
if(do_verification)
{
using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
WindowRank,
InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
ReduceOpId,
PropagateNan,
OutputIndex>;
ReferenceInstance ref;
auto ref_argument = ref.MakeArgument(in_n_c_hi_wi,
out_n_c_ho_wo_host,
out_indices_n_c_ho_wo_host,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
}
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
in_length,
window_spatial_lengths,
out_length,
{C * Hi * Wi, 1, Wi * C, C},
{C * Ho * Wo, 1, Wo * C, C},
{C * Ho * Wo, 1, Wo * C, C},
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3});
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
}
else
{
if(time_kernel)
{
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl;
}
continue;
}
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = in_n_c_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) +
out_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType);
if constexpr(OutputIndex)
num_bytes += out_indices_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(IndexDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
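// avg_time is in ms, so bytes / 1e6 / ms = GB/s.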
if(time_kernel)
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time)
{
best_instance_name = inst_ptr->GetTypeString();
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
bool pass = ck::utils::check_err(out_n_c_ho_wo_device.mData,
out_n_c_ho_wo_host.mData,
"Error: Incorrect results",
1e-3,
1e-3);
if constexpr(OutputIndex)
{
out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device,
out_indices_n_c_ho_wo_host);
}
if(do_log)
{
LogRangeAsType<float>(std::cout << "in_n_c_hi_wi : ", in_n_c_hi_wi.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_n_c_ho_wo_host : ", out_n_c_ho_wo_host.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_n_c_ho_wo_device : ", out_n_c_ho_wo_device.mData, ",")
<< std::endl;
if constexpr(OutputIndex)
LogRangeAsType<float>(std::cout << "out_indices_n_c_ho_wo_device : ",
out_indices_n_c_ho_wo_device.mData,
",")
<< std::endl;
}
if(!pass)
{
std::cout << inst_ptr->GetTypeString() << " failed verification: ";
LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl;
return false;
}
else
{
if(time_kernel)
std::cout << "pass" << std::endl;
}
}
}
if(time_kernel)
{
LogRange(std::cout << "length = ", in_length, ",") << std::endl;
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true;
}
} // namespace profiler
} // namespace ck
@@ -9,6 +9,7 @@ set(PROFILER_SOURCES
profile_layernorm_bwd_gamma_beta.cpp
profile_groupnorm_bwd_gamma_beta.cpp
profile_layernorm_fwd.cpp
profile_max_pool2d_fwd.cpp
profile_max_pool3d_fwd.cpp
profile_avg_pool3d_bwd.cpp
profile_max_pool3d_bwd.cpp
@@ -98,6 +99,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_ga
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool2d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <unordered_map>
#include "profiler/data_type_enum.hpp"
#include "profiler/profile_pool2d_fwd_impl.hpp"
#include "profiler_operation_registry.hpp"
using ck::index_t;
struct maxPoolFwdArgParser
{
std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
{"wsize", {}},
{"wstride", {}},
{"wdilation", {}},
{"pad1", {}},
{"pad2", {}}};
bool parse_opt(int argc, char* argv[], const std::string& key, int i)
{
if(std::string("--") + key == argv[i])
{
int pos = i;
while(++i < argc && argv[i][0] != '-') {}
int end = i;
for(int j = pos + 1; j < end; j++)
{
long_opts[key].push_back(std::stoi(argv[j]));
}
return true;
}
return false;
}
void operator()(int argc, char* argv[])
{
for(auto& kv : long_opts)
{
for(int i = 1; i < argc; i++)
{
if(parse_opt(argc, argv, kv.first, i))
break;
}
}
}
};
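// e.g. "--length 2 32 30 30" fills long_opts["length"] with {2, 32, 30, 30};
// value collection stops at the next token that starts with '-'.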
void print_help_max_pool2d_fwd()
{
std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
<< "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n"
<< "arg5: time kernel (0=no, 1=yes)\n"
<< "arg6: return index (0=no, 1=yes)\n"
<< "--length: input tensor length for NCHW(e.g, --length 2 32 30 30) \n"
<< "--wsize: window size for YX (e.g, --wsize 2 2) \n"
<< "--wstride: window stride for HW (e.g, --wstride 2 2) \n"
<< "--wdilation: window dilation for HW (e.g, --wdilation 1 1) \n"
<< "--pad1: left side of padding in HW (e.g, --pad1 1 1) \n"
<< "--pad2: right side of padding in HW (e.g, --pad2 1 1) \n"
<< "eg: ckProfiler max_pool2d_fwd 0 1 2 0 1 0 --length 2 32 30 30 --wsize 2 2"
"--wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1"
<< std::endl;
}
int profile_max_pool2d_fwd(int argc, char* argv[])
{
ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
bool do_verification = true;
int init_method = 0;
bool do_log = false;
bool time_kernel = true;
bool return_index = false;
std::vector<index_t> in_length = {2, 32, 30, 30};
std::vector<index_t> wsize = {2, 2};
std::vector<index_t> wstride = {2, 2};
std::vector<index_t> wdilation = {1, 1};
std::vector<index_t> pad1 = {1, 1};
std::vector<index_t> pad2 = {1, 1};
if(argc != 2 && argc != 28)
{
print_help_max_pool2d_fwd();
return 0;
}
else if(argc == 28)
{
data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
do_verification = std::stoi(argv[3]);
init_method = std::stoi(argv[4]);
do_log = std::stoi(argv[5]);
time_kernel = std::stoi(argv[6]);
return_index = std::stoi(argv[7]);
// parse the long options
maxPoolFwdArgParser arg_parser;
arg_parser(argc, argv);
in_length = arg_parser.long_opts["length"];
wsize = arg_parser.long_opts["wsize"];
wstride = arg_parser.long_opts["wstride"];
wdilation = arg_parser.long_opts["wdilation"];
pad1 = arg_parser.long_opts["pad1"];
pad2 = arg_parser.long_opts["pad2"];
}
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using I32 = int32_t;
using NHWC = ck::tensor_layout::convolution::NHWC;
constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
if(data_type == ck::DataTypeEnum::Half)
{
if(return_index)
{
ck::profiler::
profile_pool2d_fwd_impl<F16, F16, F16, I32, NHWC, NHWC, ReduceOpId, false, true>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
else
{
ck::profiler::
profile_pool2d_fwd_impl<F16, F16, F16, I32, NHWC, NHWC, ReduceOpId, false, false>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
}
else if(data_type == ck::DataTypeEnum::BFloat16)
{
if(return_index)
{
ck::profiler::
profile_pool2d_fwd_impl<BF16, BF16, BF16, I32, NHWC, NHWC, ReduceOpId, false, true>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
else
{
ck::profiler::profile_pool2d_fwd_impl<BF16,
BF16,
BF16,
I32,
NHWC,
NHWC,
ReduceOpId,
false,
false>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
}
else if(data_type == ck::DataTypeEnum::Float)
{
if(return_index)
{
ck::profiler::
profile_pool2d_fwd_impl<F32, F32, F32, I32, NHWC, NHWC, ReduceOpId, false, true>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
else
{
ck::profiler::
profile_pool2d_fwd_impl<F32, F32, F32, I32, NHWC, NHWC, ReduceOpId, false, false>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
}
else
{
throw std::runtime_error("not implemented yet");
}
return 0;
}
REGISTER_PROFILER_OPERATION("max_pool2d_fwd", "max_pool2d fwd", profile_max_pool2d_fwd);
@@ -4,13 +4,19 @@ add_gtest_executable(test_avg_pool3d_bwd test_avg_pool3d_bwd.cpp)
add_gtest_executable(test_max_pool3d_bwd test_max_pool3d_bwd.cpp)
add_gtest_executable(test_avg_pool3d_fwd test_avg_pool3d_fwd.cpp)
add_gtest_executable(test_max_pool3d_fwd test_max_pool3d_fwd.cpp)
add_gtest_executable(test_avg_pool2d_fwd test_avg_pool2d_fwd.cpp)
add_gtest_executable(test_max_pool2d_fwd test_max_pool2d_fwd.cpp)
target_link_libraries(test_avg_pool3d_bwd PRIVATE utility device_avg_pool3d_bwd_instance)
target_link_libraries(test_max_pool3d_bwd PRIVATE utility device_max_pool_bwd_instance)
target_link_libraries(test_avg_pool3d_fwd PRIVATE utility device_pool3d_fwd_instance)
target_link_libraries(test_max_pool3d_fwd PRIVATE utility device_pool3d_fwd_instance)
target_link_libraries(test_avg_pool2d_fwd PRIVATE utility device_pool2d_fwd_instance)
target_link_libraries(test_max_pool2d_fwd PRIVATE utility device_pool2d_fwd_instance)
add_dependencies(test_pool test_avg_pool3d_bwd)
add_dependencies(test_pool test_max_pool3d_bwd)
add_dependencies(test_pool test_avg_pool3d_fwd)
add_dependencies(test_pool test_max_pool3d_fwd)
add_dependencies(test_pool test_avg_pool2d_fwd)
add_dependencies(test_pool test_max_pool2d_fwd)
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_pool2d_fwd_impl.hpp"
#include "test_pool_fwd_common.hpp"
template <typename Tuple>
class TestAvgPool2dFwd : public ::testing::Test
{
protected:
using InDataType = std::tuple_element_t<0, Tuple>;
using OutDataType = std::tuple_element_t<1, Tuple>;
using ComputeDataType = std::tuple_element_t<2, Tuple>;
using IndexDataType = std::tuple_element_t<3, Tuple>;
std::vector<PoolingParam> params;
void Run()
{
for(auto param : params)
{
// avg pool
bool success =
ck::profiler::profile_pool2d_fwd_impl<InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::NHWC,
ck::ReduceTensorOp::AVG,
false,
false>(true,
2,
false,
false,
param.length_,
param.window_spatial_lengths_,
param.window_strides_,
param.window_dilations_,
param.input_left_pads_,
param.input_right_pads_);
EXPECT_TRUE(success);
}
}
};
using KernelTypes = std::conditional_t<
CK_ENABLE_FP16 && CK_ENABLE_BF16,
::testing::Types<std::tuple<F16, F16, F32, I32>,
std::tuple<F16, F16, F32, I32>,
std::tuple<BF16, BF16, F32, I32>,
std::tuple<BF16, BF16, F32, I32>,
std::tuple<F32, F32, F32, I32>,
std::tuple<F32, F32, F32, I32>>,
::testing::Types<std::tuple<F32, F32, F32, I32>, std::tuple<F32, F32, F32, I32>>>;
TYPED_TEST_SUITE(TestAvgPool2dFwd, KernelTypes);
TYPED_TEST(TestAvgPool2dFwd, Test_Pool)
{
// length, window_length, window_stride, window_dilation, left_pad, right_pad
this->params = {{{1, 1, 1, 1}, {1, 1}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
{{2, 16, 64, 64}, {64, 64}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
{{2, 16, 64, 64}, {4, 4}, {4, 4}, {2, 2}, {0, 0}, {0, 0}},
{{2, 32, 30, 30}, {2, 2}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}};
this->Run();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_pool2d_fwd_impl.hpp"
#include "test_pool_fwd_common.hpp"
template <typename Tuple>
class TestMaxPool2dFwd : public ::testing::Test
{
protected:
using InDataType = std::tuple_element_t<0, Tuple>;
using OutDataType = std::tuple_element_t<1, Tuple>;
using ComputeDataType = std::tuple_element_t<2, Tuple>;
using IndexDataType = std::tuple_element_t<3, Tuple>;
static constexpr bool ReturnIndex = std::tuple_element_t<4, Tuple>::value;
std::vector<PoolingParam> params;
void Run()
{
for(auto param : params)
{
// max pool
bool success =
ck::profiler::profile_pool2d_fwd_impl<InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::NHWC,
ck::ReduceTensorOp::MAX,
false,
ReturnIndex>(true,
2,
false,
false,
param.length_,
param.window_spatial_lengths_,
param.window_strides_,
param.window_dilations_,
param.input_left_pads_,
param.input_right_pads_);
EXPECT_TRUE(success);
}
}
};
using true_t = std::integral_constant<bool, true>;
using false_t = std::integral_constant<bool, false>;
using KernelTypes = std::conditional_t<CK_ENABLE_FP16 && CK_ENABLE_BF16,
::testing::Types<std::tuple<F16, F16, F32, I32, true_t>,
std::tuple<F16, F16, F32, I32, false_t>,
std::tuple<BF16, BF16, F32, I32, true_t>,
std::tuple<BF16, BF16, F32, I32, false_t>,
std::tuple<F32, F32, F32, I32, true_t>,
std::tuple<F32, F32, F32, I32, false_t>>,
::testing::Types<std::tuple<F32, F32, F32, I32, true_t>,
std::tuple<F32, F32, F32, I32, false_t>>>;
TYPED_TEST_SUITE(TestMaxPool2dFwd, KernelTypes);
TYPED_TEST(TestMaxPool2dFwd, Test_Pool)
{
// length, window_length, window_stride, window_dilation, left_pad, right_pad
this->params = {{{1, 1, 1, 1}, {1, 1}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
{{2, 16, 64, 64}, {64, 64}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
{{2, 16, 64, 64}, {4, 4}, {4, 4}, {2, 2}, {0, 0}, {0, 0}},
{{2, 32, 30, 30}, {2, 2}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}};
this->Run();
}