Unverified Commit 341ad956 authored by rocking, committed by GitHub

Maxpool bwd (#750)

* Add maxpool f32 kernel and example

* Revise copyright

* Add device pool bwd device op

* Support f16 and bf16

* Add compute data type for reference code.
Prevent errors in bf16

* Fix type error

* Remove layout

* Fix bf16 error

* Add f16 and bf16 example

* Add more operations

* Implement IsSupportedArgument

* Add changelog

* Add comment

* Add comment

* Remove useless header

* Move workspace initialization into Run

* Move zero-initialization of din into the device operator

* Save din_length_raw

* Remove useless header

* Calculate gridsize according to the number of CU

* Calculate gridSize according to the number of CU.
Remove useless header

* Add put example

* Remove useless header

* Fix CI fail
parent 0d911822
@@ -20,6 +20,8 @@ Full documentation for Composable Kernel is not yet available.
 - Added multi-embeddings support (#542).
 - Added Navi3x blockwise GEMM and real GEMM support (#541).
 - Added Navi grouped ConvBwdWeight support (#505).
+- Added pool3d forward (#697).
+- Added maxpool backward (#750).
 ### Changed
 - Changed ...
add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp)
add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp)
add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using InDataType = ck::bhalf_t;
using OutDataType = ck::bhalf_t;
using IndexDataType = int32_t;
using ComputeDataType = float;
using DInDataType = ck::bhalf_t;
using DOutDataType = ck::bhalf_t;
static constexpr bool PropagateNan = false;
int main()
{
bool do_verification = true;
bool time_kernel = false;
// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 1;
ck::index_t window_stride_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
IndexDataType,
ComputeDataType,
DInDataType,
DOutDataType,
PropagateNan>(do_verification,
time_kernel,
N,
C,
Y,
X,
Hi,
Wi,
window_stride_h,
window_stride_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
in_right_pad_w);
return (pass ? 0 : 1);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp"
template <typename InDataType,
typename OutDataType,
typename IndexDataType,
typename ComputeDataType,
typename DInDataType,
typename DOutDataType,
bool PropagateNan>
bool maxpool_bwd_test(bool do_verification,
bool time_kernel,
ck::index_t N,
ck::index_t C,
ck::index_t Y,
ck::index_t X,
ck::index_t Hi,
ck::index_t Wi,
ck::index_t window_stride_h,
ck::index_t window_stride_w,
ck::index_t in_left_pad_h,
ck::index_t in_left_pad_w,
ck::index_t in_right_pad_h,
ck::index_t in_right_pad_w)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
InDataType, // InDataType
OutDataType, // OutDataType
IndexDataType, // IndexDataType
ComputeDataType, // ComputeDataType
ck::ReduceTensorOp::MAX,
true, // OutputIndex
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
4, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
using DeviceMaxPoolBwdInstance = ck::tensor_operation::device::
DeviceIndexPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 4>;
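// Output spatial size follows the standard (non-dilated) pooling formula:
//   Ho = (Hi + pad_top + pad_bottom - Y) / stride_h + 1
// e.g. the fp32 example (Hi = 32, Y = 2, stride 2, no padding) gives Ho = (32 - 2) / 2 + 1 = 16.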
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
const std::vector<ck::index_t> window_spatial_lengths{Y, X};
const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
using namespace ck::literals;
// the reference implementation indexes tensors in NCHW order
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
};
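// Lengths are given in NCHW order (what the reference code indexes) while the strides
// describe the packed NHWC memory layout used by the device kernels, so one host buffer
// serves both views.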
// in
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
// out
Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
// indices
Tensor<IndexDataType> indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
Tensor<IndexDataType> indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
// dout
Tensor<DOutDataType> dout_n_c_ho_wo(f_host_tensor_descriptor(N, C, Ho, Wo));
// din
Tensor<DInDataType> din_n_c_hi_wi_host(f_host_tensor_descriptor(N, C, Hi, Wi));
Tensor<DInDataType> din_n_c_hi_wi_device(f_host_tensor_descriptor(N, C, Hi, Wi));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;
std::cout << "indices_n_c_ho_wo: " << indices_n_c_ho_wo_host.mDesc << std::endl;
std::cout << "dout_n_c_ho_wo: " << dout_n_c_ho_wo.mDesc << std::endl;
std::cout << "din_n_c_hi_wi: " << din_n_c_hi_wi_host.mDesc << std::endl;
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
dout_n_c_ho_wo.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{-1.0, 1.0});
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
DeviceMem indices_device_buf(sizeof(IndexDataType) *
indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
DeviceMem dout_device_buf(sizeof(DOutDataType) * dout_n_c_ho_wo.mDesc.GetElementSpaceSize());
DeviceMem din_device_buf(sizeof(DInDataType) *
din_n_c_hi_wi_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
dout_device_buf.ToDevice(dout_n_c_ho_wo.mData.data());
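// Run the max-pool forward first: its output indices are the scatter targets consumed by
// the backward op below.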
auto pool_fwd = DevicePoolFwdInstance{};
auto pool_fwd_invoker_ptr = pool_fwd.MakeInvokerPointer();
auto pool_fwd_argument_ptr = pool_fwd.MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
{N, C, Hi, Wi},
window_spatial_lengths,
{N, C, Ho, Wo},
{C * Hi * Wi, 1, Wi * C, C},
{C * Ho * Wo, 1, Wo * C, C},
{C * Ho * Wo, 1, Wo * C, C},
window_strides,
input_left_pads,
input_right_pads,
{2, 3});
if(!pool_fwd.IsSupportedArgument(pool_fwd_argument_ptr.get()))
{
throw std::runtime_error("wrong! pool_fwd with the specified compilation parameters does "
"not support this problem");
}
float ave_time_fwd =
pool_fwd_invoker_ptr->Run(pool_fwd_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
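// The backward op works on flattened buffers: each dOut element is scattered into dIn at
// the flat offset recorded in the forward index tensor, so only element counts are passed.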
auto pool_bwd = DeviceMaxPoolBwdInstance{};
auto pool_bwd_invoker_ptr = pool_bwd.MakeInvokerPointer();
auto pool_bwd_argument_ptr = pool_bwd.MakeArgumentPointer(
static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
dout_n_c_ho_wo.mDesc.GetElementSpaceSize(),
din_n_c_hi_wi_device.mDesc.GetElementSpaceSize(),
window_spatial_lengths,
window_strides);
if(!pool_bwd.IsSupportedArgument(pool_bwd_argument_ptr.get()))
{
throw std::runtime_error("wrong! pool_bwd with the specified compilation parameters does "
"not support this problem");
}
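// A workspace is only required when pooling windows overlap and DInDataType is neither
// float nor double; in every other case GetWorkSpaceSize reports zero.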
size_t pool_bwd_workspace_sz = pool_bwd.GetWorkSpaceSize(pool_bwd_argument_ptr.get());
DeviceMem pool_bwd_workspace_device_buf(pool_bwd_workspace_sz);
pool_bwd.SetWorkSpacePointer(pool_bwd_argument_ptr.get(),
pool_bwd_workspace_device_buf.GetDeviceBuffer());
float ave_time_bwd =
pool_bwd_invoker_ptr->Run(pool_bwd_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::cout << "Pool fwd perf: " << ave_time_fwd << " ms" << std::endl;
std::cout << "Pool bwd perf: " << ave_time_bwd << " ms" << std::endl;
bool pass = true;
if(do_verification)
{
using ReferencePoolingFwdInstance =
ck::tensor_operation::host::ReferencePoolingFwd<4,
2,
InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
ck::ReduceTensorOp::MAX,
PropagateNan,
true>;
auto ref_pooling_fwd = ReferencePoolingFwdInstance{};
auto ref_pooling_fwd_invoker = ref_pooling_fwd.MakeInvoker();
auto ref_pooling_fwd_argument = ref_pooling_fwd.MakeArgument(in_n_c_hi_wi,
out_n_c_ho_wo_host,
indices_n_c_ho_wo_host,
window_spatial_lengths,
window_strides,
input_left_pads,
input_right_pads);
ref_pooling_fwd_invoker.Run(ref_pooling_fwd_argument);
using ReferencePoolingBwdInstance =
ck::tensor_operation::host::ReferenceMaxPoolBwd<DOutDataType,
IndexDataType,
ComputeDataType,
DInDataType,
PassThrough>;
auto ref_pooling_bwd = ReferencePoolingBwdInstance{};
auto ref_pooling_bwd_invoker = ref_pooling_bwd.MakeInvoker();
auto ref_pooling_bwd_argument = ref_pooling_bwd.MakeArgument(
dout_n_c_ho_wo, indices_n_c_ho_wo_host, din_n_c_hi_wi_host, PassThrough{});
ref_pooling_bwd_invoker.Run(ref_pooling_bwd_argument);
out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
indices_device_buf.FromDevice(indices_n_c_ho_wo_device.mData.data());
din_device_buf.FromDevice(din_n_c_hi_wi_device.mData.data());
pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);
pass = pass && ck::utils::check_err(indices_n_c_ho_wo_device, indices_n_c_ho_wo_host);
pass = pass && ck::utils::check_err(din_n_c_hi_wi_device, din_n_c_hi_wi_host);
}
return (pass);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using IndexDataType = int32_t;
using ComputeDataType = float;
using DInDataType = ck::half_t;
using DOutDataType = ck::half_t;
static constexpr bool PropagateNan = false;
int main()
{
bool do_verification = true;
bool time_kernel = false;
// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 1;
ck::index_t window_stride_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
IndexDataType,
ComputeDataType,
DInDataType,
DOutDataType,
PropagateNan>(do_verification,
time_kernel,
N,
C,
Y,
X,
Hi,
Wi,
window_stride_h,
window_stride_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
in_right_pad_w);
return (pass ? 0 : 1);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using InDataType = float;
using OutDataType = float;
using IndexDataType = int32_t;
using ComputeDataType = float;
using DInDataType = float;
using DOutDataType = float;
static constexpr bool PropagateNan = false;
int main()
{
bool do_verification = true;
bool time_kernel = false;
// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
IndexDataType,
ComputeDataType,
DInDataType,
DOutDataType,
PropagateNan>(do_verification,
time_kernel,
N,
C,
Y,
X,
Hi,
Wi,
window_stride_h,
window_stride_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
in_right_pad_w);
return (pass ? 0 : 1);
}
add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
using XDataType = ck::half_t;
using YDataType = ck::half_t;
using IndexDataType = int32_t;
using YElementwiseOp = ck::tensor_operation::element_wise::PassThrough;
using DeviceInstance =
ck::tensor_operation::device::DevicePutElementImpl<XDataType, // XDataType
IndexDataType, // IndexDataType
YDataType, // YDataType
YElementwiseOp,
ck::InMemoryDataOperationEnum::Set,
1>;
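// DevicePutElementImpl scatters x into y through an index table: y[indices[i]] = op(x[i]).
// With InMemoryDataOperationEnum::Set the indices must be unique; this example uses
// indices[i] = i.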
int main()
{
bool do_verification = true;
bool time_kernel = false;
int N = 1024;
Tensor<XDataType> x(HostTensorDescriptor{N, 1});
Tensor<IndexDataType> indices(HostTensorDescriptor{N, 1});
Tensor<YDataType> y(HostTensorDescriptor{N, 1});
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{-1.0, 1.0});
for(int i = 0; i < N; ++i)
indices(i) = i;
DeviceMem x_device_buf(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem y_device_buf(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
DeviceMem indices_device_buf(sizeof(IndexDataType) * indices.mDesc.GetElementSpaceSize());
x_device_buf.ToDevice(x.mData.data());
indices_device_buf.ToDevice(indices.mData.data());
auto put_instance = DeviceInstance{};
auto put_invoker_ptr = put_instance.MakeInvokerPointer();
auto put_argument_ptr = put_instance.MakeArgumentPointer(
static_cast<XDataType*>(x_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
static_cast<YDataType*>(y_device_buf.GetDeviceBuffer()),
N,
N,
YElementwiseOp{});
if(!put_instance.IsSupportedArgument(put_argument_ptr.get()))
{
throw std::runtime_error("argument is not supported!");
}
float ave_time =
put_invoker_ptr->Run(put_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::cout << "perf: " << ave_time << " ms" << std::endl;
bool pass = true;
if(do_verification)
{
Tensor<YDataType> y_host(HostTensorDescriptor{N, 1});
for(int i = 0; i < N; ++i)
{
IndexDataType idx = indices(i);
y_host(idx) = x(i);
}
y_device_buf.FromDevice(y.mData.data());
pass = ck::utils::check_err(y, y_host);
}
return (pass ? 0 : 1);
}
@@ -8,7 +8,7 @@
 #include "ck/stream_config.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
-static int getAvailableComputeUnitCount(const StreamConfig& stream_config)
+static inline int getAvailableComputeUnitCount(const StreamConfig& stream_config)
 {
     constexpr int MAX_MASK_DWORDS = 64;
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// For pooling operators that produce indices, such as MaxPool, MinPool, etc.
template <typename DOutDataType, typename IndexDataType, typename DInDataType>
struct DeviceIndexPoolBwd : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_dout,
const void* p_indices,
void* p_din,
index_t dout_length,
index_t din_length,
std::vector<ck::index_t> window_lengths,
std::vector<ck::index_t> window_strides) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/utility/reduction_enums.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// output[indices] = input
template <typename InDataType,
typename IndexDataType,
typename OutDataType,
typename ElementwiseOperation,
InMemoryDataOperationEnum Op>
struct DevicePutElement : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_input,
const void* p_indices,
void* p_output,
index_t input_length,
index_t output_length,
ElementwiseOperation elementwise_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// output[indices] = input
template <typename DOutDataType,
typename IndexDataType,
typename DInDataType,
ck::index_t InOutVectorSize>
struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDataType, DInDataType>
{
using DInDataType_AutomicAddPreCast =
conditional_t<is_same_v<DInDataType, float> || is_same_v<DInDataType, double>,
DInDataType,
float>;
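// Hardware atomic add is only used directly for float/double dIn. For other types (f16,
// bf16) the scatter accumulates into a float workspace and the result is cast back to
// DInDataType by a second kernel.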
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using UnaryConvert = ck::tensor_operation::element_wise::UnaryConvert;
static constexpr auto I0 = Number<0>{};
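// Pad the 1-D length up to a multiple of loop_step (gridSize * blockSize * InOutVectorSize)
// so the grid-stride loop in the kernel needs no tail handling.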
template <typename Desc_M>
static auto PadDescriptor_M_1d(Desc_M desc_m, index_t loop_step)
{
const auto m = desc_m.GetLength(I0);
const auto pad = math::integer_least_multiple(m, loop_step) - m;
const auto desc_m_pad =
transform_tensor_descriptor(desc_m,
make_tuple(make_right_pad_transform(m, pad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return desc_m_pad;
}
static auto MakeDescriptor_M(index_t length, index_t loop_step)
{
const auto desc_m = make_naive_tensor_descriptor_packed(make_tuple(length));
return PadDescriptor_M_1d(desc_m, loop_step);
}
using InOutGrid1dDesc = decltype(MakeDescriptor_M(1, 1));
using GridwisePutElementSet = GridwisePutElement_1D<InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType,
PassThrough,
InMemoryDataOperationEnum::Set,
InOutVectorSize>;
using GridwisePutElementAtomicAdd = GridwisePutElement_1D<InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType_AutomicAddPreCast,
PassThrough,
InMemoryDataOperationEnum::AtomicAdd,
InOutVectorSize>;
using GridwiseCasting = GridwiseElementwise_1D<Tuple<InOutGrid1dDesc>,
Tuple<InOutGrid1dDesc>,
Tuple<const DInDataType_AutomicAddPreCast*>,
Tuple<DInDataType*>,
UnaryConvert,
InOutVectorSize,
Sequence<InOutVectorSize>,
Sequence<InOutVectorSize>>;
struct Argument : public BaseArgument
{
Argument(const DOutDataType* p_dout,
const IndexDataType* p_indices,
DInDataType* p_din,
index_t dout_length,
index_t din_length,
const std::vector<ck::index_t>& window_lengths,
const std::vector<ck::index_t>& window_strides)
: p_dout_{p_dout},
p_indices_{p_indices},
p_din_{p_din},
dout_length_raw_{dout_length},
din_length_raw_{din_length},
blockSize_{256},
windowOverlap_{false}
{
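// A window overlaps its neighbour when its length exceeds the stride in any dimension.
// Overlapping windows mean several dOut elements can point at the same dIn location, so
// the scatter must accumulate (AtomicAdd) instead of plainly storing (Set).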
for(size_t i = 0; i < window_lengths.size(); ++i)
{
windowOverlap_ |= window_lengths.at(i) > window_strides.at(i);
}
}
const DOutDataType* p_dout_;
const IndexDataType* p_indices_;
DInDataType* p_din_;
index_t dout_length_raw_;
index_t din_length_raw_;
index_t blockSize_;
bool windowOverlap_;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
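// Launch one workgroup per available compute unit; loop_step is the number of elements
// the whole grid covers in one grid-stride iteration.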
index_t gridSize = getAvailableComputeUnitCount(stream_config);
index_t loop_step = gridSize * arg.blockSize_ * InOutVectorSize;
InOutGrid1dDesc din_grid_desc = MakeDescriptor_M(arg.din_length_raw_, loop_step);
InOutGrid1dDesc dout_grid_desc = MakeDescriptor_M(arg.dout_length_raw_, loop_step);
if constexpr(is_same_v<DInDataType, float> || is_same_v<DInDataType, double>)
{
hip_check_error(hipMemsetAsync(arg.p_din_,
0,
arg.din_length_raw_ * sizeof(DInDataType),
stream_config.stream_id_));
if(arg.windowOverlap_)
{
const auto put_kernel = kernel_put_element_1d<GridwisePutElementAtomicAdd,
InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType,
PassThrough>;
return launch_and_time_kernel(stream_config,
put_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
dout_grid_desc,
arg.p_dout_,
arg.p_indices_,
arg.p_din_,
PassThrough{});
}
else
{
const auto put_kernel = kernel_put_element_1d<GridwisePutElementSet,
InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType,
PassThrough>;
return launch_and_time_kernel(stream_config,
put_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
dout_grid_desc,
arg.p_dout_,
arg.p_indices_,
arg.p_din_,
PassThrough{});
}
}
else
{
if(arg.windowOverlap_)
{
if(arg.p_workspace_ == nullptr)
throw std::runtime_error("wrong! WorkSpace pointer has not been set");
hip_check_error(
hipMemsetAsync(arg.p_workspace_,
0,
arg.din_length_raw_ * sizeof(DInDataType_AutomicAddPreCast),
stream_config.stream_id_));
const auto put_kernel = kernel_put_element_1d<GridwisePutElementAtomicAdd,
InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType_AutomicAddPreCast,
PassThrough>;
const auto cast_kernel =
kernel_elementwise_1d<GridwiseCasting,
Tuple<InOutGrid1dDesc>,
Tuple<InOutGrid1dDesc>,
Tuple<const DInDataType_AutomicAddPreCast*>,
Tuple<DInDataType*>,
UnaryConvert>;
float elapsed_time = launch_and_time_kernel(
stream_config,
put_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
dout_grid_desc,
arg.p_dout_,
arg.p_indices_,
static_cast<DInDataType_AutomicAddPreCast*>(arg.p_workspace_),
PassThrough{});
elapsed_time += launch_and_time_kernel(
stream_config,
cast_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
ck::make_tuple(din_grid_desc),
ck::make_tuple(din_grid_desc),
static_cast<DInDataType_AutomicAddPreCast*>(arg.p_workspace_),
arg.p_din_,
UnaryConvert{});
return elapsed_time;
}
else
{
const auto put_kernel = kernel_put_element_1d<GridwisePutElementSet,
InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType,
PassThrough>;
hip_check_error(hipMemsetAsync(arg.p_din_,
0,
arg.din_length_raw_ * sizeof(DInDataType),
stream_config.stream_id_));
return launch_and_time_kernel(stream_config,
put_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
dout_grid_desc,
arg.p_dout_,
arg.p_indices_,
arg.p_din_,
PassThrough{});
}
}
}
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
{
const Argument* pArg_ = dynamic_cast<const Argument*>(pArg);
bool needCast = pArg_->windowOverlap_ &&
!(is_same_v<DInDataType, float> || is_same_v<DInDataType, double>);
if(!needCast)
return 0;
else
return pArg_->din_length_raw_ * sizeof(DInDataType_AutomicAddPreCast);
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
if(pArg->din_length_raw_ % InOutVectorSize != 0 ||
pArg->dout_length_raw_ % InOutVectorSize != 0)
{
return false;
}
return true;
}
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_dout,
const void* p_indices,
void* p_din,
index_t dout_length,
index_t din_length,
std::vector<ck::index_t> window_lengths,
std::vector<ck::index_t> window_strides) override
{
// Assume p_dout, p_indices and p_din point to packed memory; dout_length and din_length
// are the physical element counts of the packed tensors
return std::make_unique<Argument>(static_cast<const DOutDataType*>(p_dout),
static_cast<const IndexDataType*>(p_indices),
static_cast<DInDataType*>(p_din),
dout_length,
din_length,
window_lengths,
window_strides);
}
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_put_element.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// output[indices] = input
template <typename InDataType,
typename IndexDataType,
typename OutDataType,
typename ElementwiseOperation,
InMemoryDataOperationEnum MemOp,
ck::index_t InVectorSize>
struct DevicePutElementImpl
: public DevicePutElement<InDataType, IndexDataType, OutDataType, ElementwiseOperation, MemOp>
{
template <typename Desc_M>
static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
{
constexpr auto I0 = Number<0>{};
const auto m = desc_m.GetLength(I0);
const index_t loop_step = gridSize * blockSize * InVectorSize;
const auto pad = math::integer_least_multiple(m, loop_step) - m;
const auto desc_m_pad =
transform_tensor_descriptor(desc_m,
make_tuple(make_right_pad_transform(m, pad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return desc_m_pad;
}
static auto MakeDescriptor_M(index_t length, index_t gridSize, index_t blockSize)
{
const auto desc_m = make_naive_tensor_descriptor_packed(make_tuple(length));
return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
}
using InGrid1dDesc = decltype(MakeDescriptor_M(1, 1, 1));
using GridwisePutElement = GridwisePutElement_1D<InGrid1dDesc,
InDataType,
IndexDataType,
OutDataType,
ElementwiseOperation,
MemOp,
InVectorSize>;
struct Argument : public BaseArgument
{
Argument(const InDataType* p_input,
const IndexDataType* p_indices,
OutDataType* p_output,
index_t input_length,
ElementwiseOperation elementwise_op)
: p_input_{p_input},
p_indices_{p_indices},
p_output_{p_output},
input_length_raw_{input_length},
elementwise_op_{elementwise_op},
blockSize_{256}
{
}
const InDataType* p_input_;
const IndexDataType* p_indices_;
OutDataType* p_output_;
index_t input_length_raw_;
ElementwiseOperation elementwise_op_;
index_t blockSize_;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
index_t gridSize = getAvailableComputeUnitCount(stream_config);
InGrid1dDesc in_grid_desc =
MakeDescriptor_M(arg.input_length_raw_, gridSize, arg.blockSize_);
const auto kernel = kernel_put_element_1d<GridwisePutElement,
InGrid1dDesc,
InDataType,
IndexDataType,
OutDataType,
ElementwiseOperation>;
float elapsed_time = launch_and_time_kernel(stream_config,
kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
in_grid_desc,
arg.p_input_,
arg.p_indices_,
arg.p_output_,
arg.elementwise_op_);
return elapsed_time;
}
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
if(pArg->input_length_raw_ % InVectorSize != 0)
{
return false;
}
return true;
}
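// The base interface also carries an output length, but output addressing comes solely
// from the index tensor, so that parameter is left unnamed and ignored below.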
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_input,
const void* p_indices,
void* p_output,
index_t input_length,
index_t,
ElementwiseOperation elementwise_op) override
{
return std::make_unique<Argument>(static_cast<const InDataType*>(p_input),
static_cast<const IndexDataType*>(p_indices),
static_cast<OutDataType*>(p_output),
input_length,
elementwise_op);
}
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <typename GridwisePutElementwise1dFunctor,
typename InGrid1dDesc,
typename InDataType,
typename IndexDataType,
typename OutDataType,
typename ElementwiseOperation>
__global__ void kernel_put_element_1d(const InGrid1dDesc in_grid_1d_desc,
const InDataType* __restrict__ p_in_global,
const IndexDataType* __restrict__ p_indices_global,
OutDataType* __restrict__ p_out_global,
const ElementwiseOperation elementwise_op)
{
GridwisePutElementwise1dFunctor::Run(
in_grid_1d_desc, p_in_global, p_indices_global, p_out_global, elementwise_op);
}
// output[indices] = input
template <typename InGrid1dDesc,
typename InDataType,
typename IndexDataType,
typename OutDataType,
typename ElementwiseOperation,
InMemoryDataOperationEnum MemOp,
index_t InVectorSize>
struct GridwisePutElement_1D
{
static constexpr auto I0 = Number<0>{};
static constexpr auto thread_buffer_desc_m =
make_naive_tensor_descriptor_packed(make_tuple(Number<InVectorSize>{}));
__device__ static void Run(const InGrid1dDesc& in_grid_1d_desc,
const InDataType* __restrict__ p_in_global,
const IndexDataType* __restrict__ p_indices_global,
OutDataType* __restrict__ p_out_global,
const ElementwiseOperation& elementwise_op)
{
// Global Memory
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_in_global, in_grid_1d_desc.GetElementSpaceSize());
const auto indices_global_buf =
make_dynamic_buffer<AddressSpaceEnum::Global>(p_indices_global,
in_grid_1d_desc.GetElementSpaceSize(),
NumericLimits<IndexDataType>::Lowest());
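// The padded tail of the descriptor is treated as invalid, and invalid reads of the index
// buffer return NumericLimits<IndexDataType>::Lowest(); those lanes therefore fail the
// index >= 0 check below and write nothing.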
// VGPR
StaticBuffer<AddressSpaceEnum::Vgpr, InDataType, InVectorSize, true> in_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, InVectorSize, true> indices_thread_buf;
// Thread id, Block id and index
const index_t thread_global_id = get_thread_global_1d_id();
const auto thread_global_offset = make_multi_index(thread_global_id * InVectorSize);
const index_t blockSize = get_block_size();
const index_t blockPerGrid = get_grid_size();
const auto M = in_grid_1d_desc.GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * InVectorSize;
const auto loop_step_index = make_multi_index(loop_step);
auto in_global_load =
ThreadwiseTensorSliceTransfer_v2<InDataType,
InDataType,
decltype(in_grid_1d_desc),
decltype(thread_buffer_desc_m),
Sequence<InVectorSize>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
InVectorSize, // ScalarPerVector
1, // SrcScalarStrideInVector
false>{in_grid_1d_desc, thread_global_offset};
auto indices_global_load =
ThreadwiseTensorSliceTransfer_v2<IndexDataType,
IndexDataType,
decltype(in_grid_1d_desc),
decltype(thread_buffer_desc_m),
Sequence<InVectorSize>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
InVectorSize, // ScalarPerVector
1, // SrcScalarStrideInVector
false>{in_grid_1d_desc, thread_global_offset};
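// The grid descriptor length was padded to a multiple of loop_step, so this division is exact.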
index_t num_iter = M / loop_step;
do
{
in_global_load.Run(in_grid_1d_desc,
in_global_buf,
thread_buffer_desc_m,
make_tuple(I0),
in_thread_buf);
in_global_load.MoveSrcSliceWindow(in_grid_1d_desc, loop_step_index);
static_for<0, InVectorSize, 1>{}(
[&](auto iM) { elementwise_op(in_thread_buf(iM), in_thread_buf[iM]); });
indices_global_load.Run(in_grid_1d_desc,
indices_global_buf,
thread_buffer_desc_m,
make_tuple(I0),
indices_thread_buf);
indices_global_load.MoveSrcSliceWindow(in_grid_1d_desc, loop_step_index);
static_for<0, InVectorSize, 1>{}([&](auto iM) {
if(indices_thread_buf[iM] >= 0)
{
if constexpr(MemOp == InMemoryDataOperationEnum::Set)
{
// The caller must guarantee that every index in p_indices_global is unique
*(p_out_global + indices_thread_buf[iM]) =
ck::type_convert<OutDataType>(in_thread_buf[iM]);
}
else if constexpr(MemOp == InMemoryDataOperationEnum::AtomicAdd)
{
atomic_add<OutDataType>(p_out_global + indices_thread_buf[iM],
ck::type_convert<OutDataType>(in_thread_buf[iM]));
}
else if constexpr(MemOp == InMemoryDataOperationEnum::AtomicMax)
{
atomic_max<OutDataType>(p_out_global + indices_thread_buf[iM],
ck::type_convert<OutDataType>(in_thread_buf[iM]));
}
else if constexpr(MemOp == InMemoryDataOperationEnum::Add)
{
// The caller must guarantee that every index in p_indices_global is unique
*(p_out_global + indices_thread_buf[iM]) +=
ck::type_convert<OutDataType>(in_thread_buf[iM]);
}
else
{
static_assert(MemOp == InMemoryDataOperationEnum::Set ||
MemOp == InMemoryDataOperationEnum::AtomicAdd ||
MemOp == InMemoryDataOperationEnum::AtomicMax ||
MemOp == InMemoryDataOperationEnum::Add);
}
}
});
} while(--num_iter);
}
};
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
using namespace std;
template <typename DOutDataType,
typename IndexDataType,
typename ConputeDataType,
typename DInDataType,
typename ElementwiseOperation>
struct ReferenceMaxPoolBwd : public device::BaseOperator
{
// Argument
struct Argument : public device::BaseArgument
{
Argument(const Tensor<DOutDataType>& dout,
const Tensor<IndexDataType>& indices,
Tensor<DInDataType>& din,
ElementwiseOperation elementwise_op)
: dout_(dout), indices_(indices), din_(din), elementwise_op_(elementwise_op)
{
}
const Tensor<DOutDataType>& dout_;
const Tensor<IndexDataType>& indices_;
Tensor<DInDataType>& din_;
ElementwiseOperation elementwise_op_;
};
// Invoker
struct Invoker : public device::BaseInvoker
{
float Run(const Argument& arg)
{
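// Reference backward: scatter-add every dOut element into an accumulation buffer at the
// flat index recorded by the forward pass, then convert the buffer to DInDataType.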
int din_length = arg.din_.GetElementSpaceSize();
int dout_length = arg.dout_.GetElementSpaceSize();
std::vector<ConputeDataType> buf(din_length, 0);
for(int i = 0; i < dout_length; ++i)
{
int index = arg.indices_.mData[i];
if(index >= 0 && index < din_length)
buf[index] += ck::type_convert<ConputeDataType>(arg.dout_.mData[i]);
}
for(int i = 0; i < din_length; ++i)
arg.din_.mData[i] = ck::type_convert<DInDataType>(buf[i]);
return 0;
}
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
static auto MakeArgument(const Tensor<DOutDataType>& dout,
const Tensor<IndexDataType>& indices,
Tensor<DInDataType>& din,
ElementwiseOperation elementwise_op)
{
return Argument{dout, indices, din, elementwise_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceMaxPoolBwd"
<< std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
@@ -100,8 +100,8 @@ struct ReferencePoolingFwd : public device::BaseOperator
                    wi >= 0 &&
                    wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[4]))
                 {
-                    ComputeDataType currVal =
-                        static_cast<ComputeDataType>(arg.in_(n, c, di, hi, wi));
+                    ComputeDataType currVal = ck::type_convert<ComputeDataType>(
+                        arg.in_(n, c, di, hi, wi));
                     in_elementwise_op(currVal, currVal);
@@ -112,7 +112,7 @@ struct ReferencePoolingFwd : public device::BaseOperator
             }
             acc_elementwise_op(accuVal, accuVal);
-            arg.out_(n, c, do_, ho, wo) = accuVal;
+            arg.out_(n, c, do_, ho, wo) = ck::type_convert<OutDataType>(accuVal);
         };
         make_ParallelTensorFunctor(f_ncdhw,
@@ -151,8 +151,8 @@ struct ReferencePoolingFwd : public device::BaseOperator
                    wi >= 0 &&
                    wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[4]))
                 {
-                    ComputeDataType currVal =
-                        static_cast<ComputeDataType>(arg.in_(n, c, di, hi, wi));
+                    ComputeDataType currVal = ck::type_convert<ComputeDataType>(
+                        arg.in_(n, c, di, hi, wi));
                     IndexDataType currIndex =
                         arg.in_.GetOffsetFromMultiIndex(n, c, di, hi, wi);
@@ -166,7 +166,7 @@ struct ReferencePoolingFwd : public device::BaseOperator
             acc_elementwise_op(accuVal, accuVal);
-            arg.out_(n, c, do_, ho, wo) = accuVal;
+            arg.out_(n, c, do_, ho, wo) = ck::type_convert<OutDataType>(accuVal);
             arg.out_indices_(n, c, do_, ho, wo) = accuIndex;
         };
@@ -212,7 +212,7 @@ struct ReferencePoolingFwd : public device::BaseOperator
                    wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[3]))
                 {
                     ComputeDataType currVal =
-                        static_cast<ComputeDataType>(arg.in_(n, c, hi, wi));
+                        ck::type_convert<ComputeDataType>(arg.in_(n, c, hi, wi));
                     in_elementwise_op(currVal, currVal);
@@ -222,7 +222,7 @@ struct ReferencePoolingFwd : public device::BaseOperator
             }
             acc_elementwise_op(accuVal, accuVal);
-            arg.out_(n, c, ho, wo) = accuVal;
+            arg.out_(n, c, ho, wo) = ck::type_convert<OutDataType>(accuVal);
         };
         make_ParallelTensorFunctor(f_nchw,
@@ -255,7 +255,7 @@ struct ReferencePoolingFwd : public device::BaseOperator
                    wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[3]))
                 {
                     ComputeDataType currVal =
-                        static_cast<ComputeDataType>(arg.in_(n, c, hi, wi));
+                        ck::type_convert<ComputeDataType>(arg.in_(n, c, hi, wi));
                     IndexDataType currIndex =
                         arg.in_.GetOffsetFromMultiIndex(n, c, hi, wi);
@@ -268,7 +268,7 @@ struct ReferencePoolingFwd : public device::BaseOperator
             }
             acc_elementwise_op(accuVal, accuVal);
-            arg.out_(n, c, ho, wo) = accuVal;
+            arg.out_(n, c, ho, wo) = ck::type_convert<OutDataType>(accuVal);
             arg.out_indices_(n, c, ho, wo) = accuIndex;
         };
...