Refactor pool fwd (#815)

* Do not hardcode stride * devicePool2DFwd Inherit devicePool3DFwd * Move instance declaration out of common * Add dilation * use the pool3d rank, because pool2d inherits pool3d * calculate Do Ho Wo for the dilation * Fix header name * Modify ckProfiler * Remove pool2d instance * Remove pool2d in profiler * Remove pool2d and add dilation * In the client example, this commit revises the following: 1. Add dilation. 2. Use pool3d to implement pool2d * Refine naming and IsSupportedArgument() * Add dilation to maxpool bwd example * clang format * 1. Remove useless header 2. Fix copyright 3. Refine naming * Add layout parameter to pool fwd * clang format * Fix merge error * Fix compile error * Remove layout parameter in derived class * Refine changelog * Fix compile error * Fix compiler error * Add layout to external api and profiler

Refactor pool fwd (#815)
* Do not hardcode stride * devicePool2DFwd Inherit devicePool3DFwd * Move instance declaration out of common * Add dilation * use the pool3d rank, because pool2d inherits pool3d * calculate Do Ho Wo for the dilation * Fix header name * Modify ckProfiler * Remove pool2d instance * Remove pool2d in profiler * Remove pool2d and add dilation * In the client example, this commit revises the following: 1. Add dilation. 2. Use pool3d to implement pool2d * Refine naming and IsSupportedArgument() * Add dilation to maxpool bwd example * clang format * 1. Remove useless header 2. Fix copyright 3. Refine naming * Add layout parameter to pool fwd * clang format * Fix merge error * Fix compile error * Remove layout parameter in derived class * Refine changelog * Fix compile error * Fix compiler error * Add layout to external api and profiler
f60f0a5e · rocking · GitHub · 03b8119e · f60f0a5e · f60f0a5e
Unverified Commit f60f0a5e authored Aug 15, 2023 by rocking Committed by GitHub Aug 15, 2023
19 changed files
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
@@ -11,7 +11,9 @@ namespace instance {
 static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
 void add_device_pool3d_fwd_ndhwc_f32_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F32, F32, I32, ReduceOpId, false>>>& instances)
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<5, 3, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
+        instances)
 {
    add_device_operation_instances(
        instances, device_pool3d_fwd_ndhwc_instances<F32, F32, I32, F32, ReduceOpId, false>{});

--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
@@ -11,14 +11,18 @@ namespace instance {
 static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
 void add_device_pool3d_fwd_ndhwc_f16_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F16, F16, I32, ReduceOpId, false>>>& instances)
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<5, 3, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
+        instances)
 {
    add_device_operation_instances(
        instances, device_pool3d_fwd_ndhwc_instances<F16, F16, I32, F16, ReduceOpId, false>{});
 }
 void add_device_pool3d_fwd_ndhwc_index_f16_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F16, F16, I32, ReduceOpId, true>>>& instances)
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<5, 3, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, true>>>&
+        instances)
 {
    add_device_operation_instances(
        instances, device_pool3d_fwd_ndhwc_instances<F16, F16, I32, F16, ReduceOpId, true>{});

--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
@@ -11,14 +11,18 @@ namespace instance {
 static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
 void add_device_pool3d_fwd_ndhwc_f32_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F32, F32, I32, ReduceOpId, false>>>& instances)
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<5, 3, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
+        instances)
 {
    add_device_operation_instances(
        instances, device_pool3d_fwd_ndhwc_instances<F32, F32, I32, F32, ReduceOpId, false>{});
 }
 void add_device_pool3d_fwd_ndhwc_index_f32_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F32, F32, I32, ReduceOpId, true>>>& instances)
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<5, 3, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, true>>>&
+        instances)
 {
    add_device_operation_instances(
        instances, device_pool3d_fwd_ndhwc_instances<F32, F32, I32, F32, ReduceOpId, true>{});

--- a/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
@@ -15,24 +15,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
-using I32 = int32_t;
+using I32   = int32_t;
-using F16 = ck::half_t;
+using F16   = ck::half_t;
-using F32 = float;
+using F32   = float;
+using NDHWC = ck::tensor_layout::convolution::NDHWC;
-template <typename InDataType,
-          typename OutDataType,
-          typename IndexDataType,
-          typename ComputeDataType,
-          ReduceTensorOp ReduceOpId,
-          bool OutputIndex>
-using device_pool2d_fwd_nhwc_instances =
-    // clang-format off
-    std::tuple <
-        DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 1, 1, 1>,
-        DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 2, 1, 2>,
-        DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 4, 1, 4>
-               // clang-format on
-               >;
 template <typename InDataType,
          typename OutDataType,
@@ -43,9 +29,9 @@ template <typename InDataType,
 using device_pool3d_fwd_ndhwc_instances =
    // clang-format off
    std::tuple <
-        DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 1, 1, 1>,
+        DevicePool3dFwd_NDHWC_NDHWC<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 1, 1, 1>,
-        DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 2, 1, 2>,
+        DevicePool3dFwd_NDHWC_NDHWC<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 2, 1, 2>,
-        DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 4, 1, 4>
+        DevicePool3dFwd_NDHWC_NDHWC<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 4, 1, 4>
               // clang-format on
               >;

--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "pool_fwd_instance_common.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
-void add_device_pool2d_fwd_nhwc_f16_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, false>>>& instances)
-{
-    add_device_operation_instances(
-        instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, false>{});
-}
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "pool_fwd_instance_common.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
-void add_device_pool2d_fwd_nhwc_f32_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, ReduceOpId, false>>>& instances)
-{
-    add_device_operation_instances(
-        instances, device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>{});
-}
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "pool_fwd_instance_common.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
-void add_device_pool2d_fwd_nhwc_f16_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, false>>>& instances)
-{
-    add_device_operation_instances(
-        instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F16, ReduceOpId, false>{});
-}
-void add_device_pool2d_fwd_nhwc_index_f16_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, true>>>& instances)
-{
-    add_device_operation_instances(
-        instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F16, ReduceOpId, true>{});
-}
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "pool_fwd_instance_common.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
-void add_device_pool2d_fwd_nhwc_f32_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, ReduceOpId, false>>>& instances)
-{
-    add_device_operation_instances(
-        instances, device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>{});
-}
-void add_device_pool2d_fwd_nhwc_index_f32_instances(
-    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, ReduceOpId, true>>>& instances)
-{
-    add_device_operation_instances(
-        instances, device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, true>{});
-}
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-#include <iomanip>
-#include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/literals.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
-namespace ck {
-namespace profiler {
-template <typename InDataType,
-          typename OutDataType,
-          typename ComputeDataType,
-          typename IndexDataType,
-          ck::ReduceTensorOp ReduceOpId,
-          bool PropagateNan,
-          bool OutputIndex>
-bool profile_pool2d_fwd_impl(int do_verification,
-                             int init_method,
-                             bool do_log,
-                             bool time_kernel,
-                             std::vector<index_t> in_length, // NCHW
-                             std::vector<index_t> window_spatial_lengths,
-                             std::vector<index_t> window_strides,
-                             std::vector<index_t> input_left_pads,
-                             std::vector<index_t> input_right_pads)
-{
-    constexpr index_t InOutRank  = 4;
-    constexpr index_t WindowRank = 2;
-    if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
-       window_strides.size() != WindowRank || input_left_pads.size() != WindowRank ||
-       input_right_pads.size() != WindowRank)
-        return false;
-    std::vector<index_t> out_length(InOutRank);
-    int N = in_length[0];
-    int C = in_length[1];
-    out_length[0] = N;
-    out_length[1] = C;
-    // Calculate Ho, Wo
-    for(int i = 2; i < InOutRank; ++i)
-    {
-        auto pad1           = input_left_pads[i - 2];
-        auto pad2           = input_right_pads[i - 2];
-        auto windows_size   = window_spatial_lengths[i - 2];
-        auto windows_stride = window_strides[i - 2];
-        out_length[i]       = (in_length[i] + pad1 + pad2 - windows_size) / windows_stride + 1;
-    }
-    int Hi = in_length[2];
-    int Wi = in_length[3];
-    int Ho = out_length[2];
-    int Wo = out_length[3];
-    auto f_host_tensor_descriptor =
-        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
-            using namespace ck::literals;
-            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
-        };
-    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
-    Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
-    Tensor<IndexDataType> out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
-    Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
-    Tensor<IndexDataType> out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
-    switch(init_method)
-    {
-    case 0: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
-    case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
-    default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
-    }
-    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
-    DeviceMem out_device_buf(sizeof(OutDataType) *
-                             out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
-    DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
-                                     out_indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
-    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
-    // add device normalization instances
-    using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
-                                                                 WindowRank,
-                                                                 InDataType,
-                                                                 OutDataType,
-                                                                 IndexDataType,
-                                                                 ReduceOpId,
-                                                                 OutputIndex>;
-    // get device op instances
-    const auto instance_ptrs =
-        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-            DeviceOp>::GetInstances();
-    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
-    std::string best_instance_name;
-    float best_avg_time   = std::numeric_limits<float>::max();
-    float best_gb_per_sec = 0;
-    if(do_verification)
-    {
-        using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
-                                                                                  WindowRank,
-                                                                                  InDataType,
-                                                                                  OutDataType,
-                                                                                  ComputeDataType,
-                                                                                  IndexDataType,
-                                                                                  ReduceOpId,
-                                                                                  PropagateNan,
-                                                                                  OutputIndex>;
-        ReferenceInstance ref;
-        auto ref_argument = ref.MakeArgument(in_n_c_hi_wi,
-                                             out_n_c_ho_wo_host,
-                                             out_indices_n_c_ho_wo_host,
-                                             window_spatial_lengths,
-                                             window_strides,
-                                             input_left_pads,
-                                             input_right_pads);
-        auto ref_invoker  = ref.MakeInvoker();
-        ref_invoker.Run(ref_argument);
-    }
-    int num_kernel = 0;
-    for(auto& inst_ptr : instance_ptrs)
-    {
-        auto argument_ptr = inst_ptr->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
-            in_length,
-            window_spatial_lengths,
-            out_length,
-            {C * Hi * Wi, 1, Wi * C, C},
-            {C * Ho * Wo, 1, Wo * C, C},
-            {C * Ho * Wo, 1, Wo * C, C},
-            window_strides,
-            input_left_pads,
-            input_right_pads,
-            {2, 3});
-        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-            ++num_kernel;
-        }
-        else
-        {
-            if(time_kernel)
-            {
-                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
-                LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl;
-            }
-            continue;
-        }
-        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
-        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-        std::size_t num_bytes = in_n_c_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) +
-                                out_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType);
-        if constexpr(OutputIndex)
-            num_bytes += out_indices_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(IndexDataType);
-        float gb_per_sec = num_bytes / 1.E6 / avg_time;
-        if(time_kernel)
-            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
-                      << inst_ptr->GetTypeString() << std::endl;
-        if(avg_time < best_avg_time)
-        {
-            best_instance_name = inst_ptr->GetTypeString();
-            best_avg_time      = avg_time;
-            best_gb_per_sec    = gb_per_sec;
-        }
-        if(do_verification)
-        {
-            out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
-            bool pass = ck::utils::check_err(out_n_c_ho_wo_device.mData,
-                                             out_n_c_ho_wo_host.mData,
-                                             "Error: Incorrect results",
-                                             1e-3,
-                                             1e-3);
-            if constexpr(OutputIndex)
-            {
-                out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
-                pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device,
-                                                    out_indices_n_c_ho_wo_host);
-            }
-            if(do_log)
-            {
-                LogRangeAsType<float>(std::cout << "in_n_c_hi_wi  : ", in_n_c_hi_wi.mData, ",")
-                    << std::endl;
-                LogRangeAsType<float>(
-                    std::cout << "out_n_c_ho_wo_host  : ", out_n_c_ho_wo_host.mData, ",")
-                    << std::endl;
-                LogRangeAsType<float>(
-                    std::cout << "out_n_c_ho_wo_device  : ", out_n_c_ho_wo_device.mData, ",")
-                    << std::endl;
-                if constexpr(OutputIndex)
-                    LogRangeAsType<float>(std::cout << "out_indices_n_c_ho_wo_device  : ",
-                                          out_indices_n_c_ho_wo_device.mData,
-                                          ",")
-                        << std::endl;
-            }
-            if(!pass)
-            {
-                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
-                LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl;
-                return false;
-            }
-            else
-            {
-                if(time_kernel)
-                    std::cout << "pass" << std::endl;
-            }
-        }
-    }
-    if(time_kernel)
-    {
-        LogRange(std::cout << "length = ", in_length, ",") << std::endl;
-        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
-                  << best_instance_name << std::endl;
-    }
-    if(num_kernel == 0)
-    {
-        std::cout << "Error: No kernel is applicable" << std::endl;
-        return false;
-    }
-    return true;
-}
-} // namespace profiler
-} // namespace ck
--- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
@@ -21,6 +21,8 @@ template <typename InDataType,
          typename OutDataType,
          typename ComputeDataType,
          typename IndexDataType,
+          typename InLayout,
+          typename OutLayout,
          ck::ReduceTensorOp ReduceOpId,
          bool PropagateNan,
          bool OutputIndex>
@@ -31,6 +33,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
                             std::vector<index_t> in_length, // NCDHW
                             std::vector<index_t> window_spatial_lengths,
                             std::vector<index_t> window_strides,
+                             std::vector<index_t> window_dilations,
                             std::vector<index_t> input_left_pads,
                             std::vector<index_t> input_right_pads)
 {
@@ -38,8 +41,8 @@ bool profile_pool3d_fwd_impl(int do_verification,
    constexpr index_t WindowRank = 3;
    if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
-       window_strides.size() != WindowRank || input_left_pads.size() != WindowRank ||
+       window_strides.size() != WindowRank || window_dilations.size() != WindowRank ||
-       input_right_pads.size() != WindowRank)
+       input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank)
        return false;
    std::vector<index_t> out_length(InOutRank);
@@ -53,11 +56,13 @@ bool profile_pool3d_fwd_impl(int do_verification,
    // Calculate Do, Ho, Wo
    for(int i = 2; i < InOutRank; ++i)
    {
-        auto pad1           = input_left_pads[i - 2];
+        auto pad1             = input_left_pads[i - 2];
-        auto pad2           = input_right_pads[i - 2];
+        auto pad2             = input_right_pads[i - 2];
-        auto windows_size   = window_spatial_lengths[i - 2];
+        auto windows_size     = window_spatial_lengths[i - 2];
-        auto windows_stride = window_strides[i - 2];
+        auto windows_stride   = window_strides[i - 2];
-        out_length[i]       = (in_length[i] + pad1 + pad2 - windows_size) / windows_stride + 1;
+        auto windows_dilation = window_dilations[i - 2];
+        auto eff              = (windows_size - 1) * windows_dilation + 1;
+        out_length[i]         = (in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
    }
    int Di = in_length[2];
@@ -104,6 +109,8 @@ bool profile_pool3d_fwd_impl(int do_verification,
                                                                 InDataType,
                                                                 OutDataType,
                                                                 IndexDataType,
+                                                                 InLayout,
+                                                                 OutLayout,
                                                                 ReduceOpId,
                                                                 OutputIndex>;
@@ -136,6 +143,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
                                             out_indices_n_c_do_ho_wo_host,
                                             window_spatial_lengths,
                                             window_strides,
+                                             window_dilations,
                                             input_left_pads,
                                             input_right_pads);
        auto ref_invoker  = ref.MakeInvoker();
@@ -157,6 +165,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
            {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
            {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
            window_strides,
+            window_dilations,
            input_left_pads,
            input_right_pads,
            {2, 3, 4});

--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -17,7 +17,6 @@ set(PROFILER_SOURCES
    profile_reduce.cpp
    profile_groupnorm.cpp
    profile_layernorm.cpp
-    profile_avg_pool2d_fwd.cpp
    profile_max_pool3d_fwd.cpp
    profile_softmax.cpp
    profile_batchnorm_fwd.cpp
@@ -74,7 +73,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
 if(DL_KERNELS)

--- a/profiler/src/profile_avg_pool2d_fwd.cpp
+++ b/profiler/src/profile_avg_pool2d_fwd.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include <iostream>
-#include <vector>
-#include <unordered_map>
-#include "profiler/data_type_enum.hpp"
-#include "profiler/profile_pool2d_fwd_impl.hpp"
-#include "profiler_operation_registry.hpp"
-using ck::index_t;
-struct avgPoolFwdArgParser
-{
-    std::unordered_map<std::string, std::vector<int>> long_opts = {
-        {"length", {}}, {"wsize", {}}, {"wstride", {}}, {"pad1", {}}, {"pad2", {}}};
-    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
-    {
-        if(std::string("--") + key == argv[i])
-        {
-            int pos = i;
-            while(++i < argc && argv[i][0] != '-') {}
-            int end = i;
-            for(int j = pos + 1; j < end; j++)
-            {
-                long_opts[key].push_back(std::stoi(argv[j]));
-            }
-            return true;
-        }
-        return false;
-    }
-    void operator()(int argc, char* argv[])
-    {
-        for(auto& kv : long_opts)
-        {
-            for(int i = 1; i < argc; i++)
-            {
-                if(parse_opt(argc, argv, kv.first, i))
-                    break;
-            }
-        }
-    }
-};
-void print_help_avg_pool2d_fwd()
-{
-    std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
-              << "arg2: verification (0: no; 1: yes)\n"
-              << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
-              << "arg4: print tensor value (0: no; 1: yes)\n"
-              << "arg5: time kernel (0=no, 1=yes)\n"
-              << "--length: input tensor length for NDHW(e.g, --length 2 32 30 30) \n"
-              << "--wsize: window size for YX (e.g, --wsize 2 2) \n"
-              << "--wstride: window stride for HW (e.g, --wstride 2 2) \n"
-              << "--pad1: left side of padding in HW (e.g, --pad1 1 1) \n"
-              << "--pad2: right side of padding in HW (e.g, --pad2 1 1) \n"
-              << "eg: ckProfiler avg_pool2d_fwd 0 1 2 0 1 0 --length 2 32 30 30 --wsize 2 2 "
-                 "--wstride 2 2 --pad1 1 1 --pad2 1 1"
-              << std::endl;
-}
-int profile_avg_pool2d_fwd(int argc, char* argv[])
-{
-    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
-    bool do_verification       = true;
-    int init_method            = 0;
-    bool do_log                = false;
-    bool time_kernel           = true;
-    std::vector<index_t> in_length = {2, 32, 30, 30};
-    std::vector<index_t> wsize     = {2, 2};
-    std::vector<index_t> wstride   = {2, 2};
-    std::vector<index_t> pad1      = {1, 1};
-    std::vector<index_t> pad2      = {1, 1};
-    if(argc != 2 && argc != 25)
-    {
-        print_help_avg_pool2d_fwd();
-        return 0;
-    }
-    else if(argc == 25)
-    {
-        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
-        do_verification = std::stoi(argv[3]);
-        init_method     = std::stoi(argv[4]);
-        do_log          = std::stoi(argv[5]);
-        time_kernel     = std::stoi(argv[6]);
-        // parse the long options
-        avgPoolFwdArgParser arg_parser;
-        arg_parser(argc, argv);
-        in_length = arg_parser.long_opts["length"];
-        wsize     = arg_parser.long_opts["wsize"];
-        wstride   = arg_parser.long_opts["wstride"];
-        pad1      = arg_parser.long_opts["pad1"];
-        pad2      = arg_parser.long_opts["pad2"];
-    }
-    using F16                 = ck::half_t;
-    using F32                 = float;
-    using I32                 = int32_t;
-    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
-    if(data_type == ck::DataTypeEnum::Half)
-    {
-        ck::profiler::profile_pool2d_fwd_impl<F16, F16, F32, I32, ReduceOpId, false, false>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            in_length,
-            wsize,
-            wstride,
-            pad1,
-            pad2);
-    }
-    else if(data_type == ck::DataTypeEnum::Float)
-    {
-        ck::profiler::profile_pool2d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, false>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            in_length,
-            wsize,
-            wstride,
-            pad1,
-            pad2);
-    }
-    else
-    {
-        throw std::runtime_error("not implemented yet");
-    }
-    return 0;
-}
-REGISTER_PROFILER_OPERATION("avg_pool2d_fwd", "avg_pool2d fwd", profile_avg_pool2d_fwd);
--- a/profiler/src/profile_max_pool3d_fwd.cpp
+++ b/profiler/src/profile_max_pool3d_fwd.cpp
@@ -13,8 +13,12 @@ using ck::index_t;
 struct maxPoolFwdArgParser
 {
-    std::unordered_map<std::string, std::vector<int>> long_opts = {
+    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
-        {"length", {}}, {"wsize", {}}, {"wstride", {}}, {"pad1", {}}, {"pad2", {}}};
+                                                                   {"wsize", {}},
+                                                                   {"wstride", {}},
+                                                                   {"wdilation", {}},
+                                                                   {"pad1", {}},
+                                                                   {"pad2", {}}};
    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
    {
@@ -56,10 +60,11 @@ void print_help_max_pool3d_fwd()
              << "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
              << "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
              << "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
+              << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
              << "eg: ckProfiler max_pool3d_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 "
-                 "--wstride 2 2 2 --pad1 1 1 1 --pad2 1 1 1"
+                 "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
              << std::endl;
 }
@@ -75,15 +80,16 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
    std::vector<index_t> in_length = {2, 32, 30, 30, 30};
    std::vector<index_t> wsize     = {2, 2, 2};
    std::vector<index_t> wstride   = {2, 2, 2};
+    std::vector<index_t> wdilation = {1, 1, 1};
    std::vector<index_t> pad1      = {1, 1, 1};
    std::vector<index_t> pad2      = {1, 1, 1};
-    if(argc != 2 && argc != 30)
+    if(argc != 2 && argc != 34)
    {
        print_help_max_pool3d_fwd();
        return 0;
    }
-    else if(argc == 30)
+    else if(argc == 34)
    {
        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
        do_verification = std::stoi(argv[3]);
@@ -98,64 +104,79 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
        in_length = arg_parser.long_opts["length"];
        wsize     = arg_parser.long_opts["wsize"];
        wstride   = arg_parser.long_opts["wstride"];
+        wdilation = arg_parser.long_opts["wdilation"];
        pad1      = arg_parser.long_opts["pad1"];
        pad2      = arg_parser.long_opts["pad2"];
    }
-    using F16                 = ck::half_t;
+    using F16   = ck::half_t;
-    using F32                 = float;
+    using F32   = float;
-    using I32                 = int32_t;
+    using I32   = int32_t;
+    using NDHWC = ck::tensor_layout::convolution::NDHWC;
+#if 1
    constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+#else
+    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+#endif
    if(data_type == ck::DataTypeEnum::Half)
    {
        if(return_index)
-            ck::profiler::profile_pool3d_fwd_impl<F16, F16, F16, I32, ReduceOpId, false, true>(
+            ck::profiler::
-                do_verification,
+                profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
-                init_method,
+                    do_verification,
-                do_log,
+                    init_method,
-                time_kernel,
+                    do_log,
-                in_length,
+                    time_kernel,
-                wsize,
+                    in_length,
-                wstride,
+                    wsize,
-                pad1,
+                    wstride,
-                pad2);
+                    wdilation,
+                    pad1,
+                    pad2);
        else
-            ck::profiler::profile_pool3d_fwd_impl<F16, F16, F16, I32, ReduceOpId, false, false>(
+            ck::profiler::
-                do_verification,
+                profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
-                init_method,
+                    do_verification,
-                do_log,
+                    init_method,
-                time_kernel,
+                    do_log,
-                in_length,
+                    time_kernel,
-                wsize,
+                    in_length,
-                wstride,
+                    wsize,
-                pad1,
+                    wstride,
-                pad2);
+                    wdilation,
+                    pad1,
+                    pad2);
    }
    else if(data_type == ck::DataTypeEnum::Float)
    {
        if(return_index)
-            ck::profiler::profile_pool3d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, true>(
+            ck::profiler::
-                do_verification,
+                profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
-                init_method,
+                    do_verification,
-                do_log,
+                    init_method,
-                time_kernel,
+                    do_log,
-                in_length,
+                    time_kernel,
-                wsize,
+                    in_length,
-                wstride,
+                    wsize,
-                pad1,
+                    wstride,
-                pad2);
+                    wdilation,
+                    pad1,
+                    pad2);
        else
-            ck::profiler::profile_pool3d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, false>(
+            ck::profiler::
-                do_verification,
+                profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
-                init_method,
+                    do_verification,
-                do_log,
+                    init_method,
-                time_kernel,
+                    do_log,
-                in_length,
+                    time_kernel,
-                wsize,
+                    in_length,
-                wstride,
+                    wsize,
-                pad1,
+                    wstride,
-                pad2);
+                    wdilation,
+                    pad1,
+                    pad2);
    }
    else
    {

--- a/test/pool_fwd/CMakeLists.txt
+++ b/test/pool_fwd/CMakeLists.txt
 add_custom_target(test_pool_fwd)
-add_gtest_executable(test_avg_pool2d_fwd test_avg_pool2d_fwd.cpp)
 add_gtest_executable(test_avg_pool3d_fwd test_avg_pool3d_fwd.cpp)
-add_gtest_executable(test_max_pool2d_fwd test_max_pool2d_fwd.cpp)
 add_gtest_executable(test_max_pool3d_fwd test_max_pool3d_fwd.cpp)
-target_link_libraries(test_avg_pool2d_fwd PRIVATE utility device_pool_fwd_instance)
+target_link_libraries(test_avg_pool3d_fwd PRIVATE utility device_pool3d_fwd_instance)
-target_link_libraries(test_avg_pool3d_fwd PRIVATE utility device_pool_fwd_instance)
+target_link_libraries(test_max_pool3d_fwd PRIVATE utility device_pool3d_fwd_instance)
-target_link_libraries(test_max_pool2d_fwd PRIVATE utility device_pool_fwd_instance)
-target_link_libraries(test_max_pool3d_fwd PRIVATE utility device_pool_fwd_instance)
-add_dependencies(test_pool_fwd test_avg_pool2d_fwd)
 add_dependencies(test_pool_fwd test_avg_pool3d_fwd)
-add_dependencies(test_pool_fwd test_max_pool2d_fwd)
 add_dependencies(test_pool_fwd test_max_pool3d_fwd)
--- a/test/pool_fwd/test_avg_pool2d_fwd.cpp
+++ b/test/pool_fwd/test_avg_pool2d_fwd.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "profiler/profile_pool2d_fwd_impl.hpp"
-#include "test_pool_fwd_common.hpp"
-template <typename Tuple>
-class TestAvgPool2dFwd : public ::testing::Test
-{
-    protected:
-    using InDataType      = std::tuple_element_t<0, Tuple>;
-    using OutDataType     = std::tuple_element_t<1, Tuple>;
-    using ComputeDataType = std::tuple_element_t<2, Tuple>;
-    using IndexDataType   = std::tuple_element_t<3, Tuple>;
-    std::vector<PoolingParam> params;
-    void Run()
-    {
-        for(auto param : params)
-        {
-            bool success =
-                ck::profiler::profile_pool2d_fwd_impl<InDataType,
-                                                      OutDataType,
-                                                      ComputeDataType,
-                                                      IndexDataType,
-                                                      ck::ReduceTensorOp::AVG,
-                                                      false,
-                                                      false>(true,
-                                                             2,
-                                                             false,
-                                                             false,
-                                                             param.length_,
-                                                             param.window_spatial_lengths_,
-                                                             param.window_strides_,
-                                                             param.input_left_pads_,
-                                                             param.input_right_pads_);
-            EXPECT_TRUE(success);
-        }
-    }
-};
-#ifdef __fp16__
-using KernelTypes =
-    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
-#else
-using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
-#endif
-TYPED_TEST_SUITE(TestAvgPool2dFwd, KernelTypes);
-TYPED_TEST(TestAvgPool2dFwd, Test_Pool)
-{
-    // length, window_length, window_stride, left_pad, right_pad
-    this->params = {{{1, 1, 1, 1}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
-                    {{2, 16, 64, 64}, {64, 64}, {1, 1}, {0, 0}, {0, 0}},
-                    {{2, 32, 30, 30}, {2, 2}, {2, 2}, {1, 1}, {1, 1}}};
-    this->Run();
-}
--- a/test/pool_fwd/test_avg_pool3d_fwd.cpp
+++ b/test/pool_fwd/test_avg_pool3d_fwd.cpp
@@ -25,6 +25,8 @@ class TestAvgPool3dFwd : public ::testing::Test
                                                      OutDataType,
                                                      ComputeDataType,
                                                      IndexDataType,
+                                                      ck::tensor_layout::convolution::NDHWC,
+                                                      ck::tensor_layout::convolution::NDHWC,
                                                      ck::ReduceTensorOp::AVG,
                                                      false,
                                                      false>(true,
@@ -34,6 +36,7 @@ class TestAvgPool3dFwd : public ::testing::Test
                                                             param.length_,
                                                             param.window_spatial_lengths_,
                                                             param.window_strides_,
+                                                             param.window_dilations_,
                                                             param.input_left_pads_,
                                                             param.input_right_pads_);
            EXPECT_TRUE(success);
@@ -49,10 +52,11 @@ using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
 TYPED_TEST_SUITE(TestAvgPool3dFwd, KernelTypes);
 TYPED_TEST(TestAvgPool3dFwd, Test_Pool)
 {
-    // length, window_length, window_stride, left_pad, right_pad
+    // length, window_length, window_stride, window_dilation, left_pad, right_pad
-    this->params = {{{1, 1, 1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
+    this->params = {{{1, 1, 1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
-                    {{2, 16, 64, 64, 64}, {64, 64, 64}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
+                    {{2, 16, 64, 64, 64}, {64, 64, 64}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
-                    {{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}}};
+                    {{2, 16, 64, 64, 64}, {4, 4, 4}, {4, 4, 4}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}},
+                    {{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
    this->Run();
 }
--- a/test/pool_fwd/test_max_pool2d_fwd.cpp
+++ b/test/pool_fwd/test_max_pool2d_fwd.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "profiler/profile_pool2d_fwd_impl.hpp"
-#include "test_pool_fwd_common.hpp"
-template <typename Tuple>
-class TestMaxPool2dFwd : public ::testing::Test
-{
-    protected:
-    using InDataType      = std::tuple_element_t<0, Tuple>;
-    using OutDataType     = std::tuple_element_t<1, Tuple>;
-    using ComputeDataType = std::tuple_element_t<2, Tuple>;
-    using IndexDataType   = std::tuple_element_t<3, Tuple>;
-    std::vector<PoolingParam> params;
-    void Run()
-    {
-        for(auto param : params)
-        {
-            // max pool
-            bool success =
-                ck::profiler::profile_pool2d_fwd_impl<InDataType,
-                                                      OutDataType,
-                                                      ComputeDataType,
-                                                      IndexDataType,
-                                                      ck::ReduceTensorOp::MAX,
-                                                      false,
-                                                      false>(true,
-                                                             2,
-                                                             false,
-                                                             false,
-                                                             param.length_,
-                                                             param.window_spatial_lengths_,
-                                                             param.window_strides_,
-                                                             param.input_left_pads_,
-                                                             param.input_right_pads_);
-            EXPECT_TRUE(success);
-            // max pool + index
-            success = ck::profiler::profile_pool2d_fwd_impl<InDataType,
-                                                            OutDataType,
-                                                            ComputeDataType,
-                                                            IndexDataType,
-                                                            ck::ReduceTensorOp::MAX,
-                                                            false,
-                                                            true>(true,
-                                                                  2,
-                                                                  false,
-                                                                  false,
-                                                                  param.length_,
-                                                                  param.window_spatial_lengths_,
-                                                                  param.window_strides_,
-                                                                  param.input_left_pads_,
-                                                                  param.input_right_pads_);
-            EXPECT_TRUE(success);
-        }
-    }
-};
-#ifdef __fp16__
-using KernelTypes =
-    ::testing::Types<std::tuple<F16, F16, F16, I32>, std::tuple<F32, F32, F32, I32>>;
-#else
-using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
-#endif
-TYPED_TEST_SUITE(TestMaxPool2dFwd, KernelTypes);
-TYPED_TEST(TestMaxPool2dFwd, Test_Pool)
-{
-    // length, window_length, window_stride, left_pad, right_pad
-    this->params = {{{1, 1, 1, 1}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
-                    {{2, 16, 64, 64}, {64, 64}, {1, 1}, {0, 0}, {0, 0}},
-                    {{2, 32, 30, 30}, {2, 2}, {2, 2}, {1, 1}, {1, 1}}};
-    this->Run();
-}
--- a/test/pool_fwd/test_max_pool3d_fwd.cpp
+++ b/test/pool_fwd/test_max_pool3d_fwd.cpp
@@ -26,6 +26,8 @@ class TestMaxPool3dFwd : public ::testing::Test
                                                      OutDataType,
                                                      ComputeDataType,
                                                      IndexDataType,
+                                                      ck::tensor_layout::convolution::NDHWC,
+                                                      ck::tensor_layout::convolution::NDHWC,
                                                      ck::ReduceTensorOp::MAX,
                                                      false,
                                                      false>(true,
@@ -35,6 +37,7 @@ class TestMaxPool3dFwd : public ::testing::Test
                                                             param.length_,
                                                             param.window_spatial_lengths_,
                                                             param.window_strides_,
+                                                             param.window_dilations_,
                                                             param.input_left_pads_,
                                                             param.input_right_pads_);
            EXPECT_TRUE(success);
@@ -44,6 +47,8 @@ class TestMaxPool3dFwd : public ::testing::Test
                                                            OutDataType,
                                                            ComputeDataType,
                                                            IndexDataType,
+                                                            ck::tensor_layout::convolution::NDHWC,
+                                                            ck::tensor_layout::convolution::NDHWC,
                                                            ck::ReduceTensorOp::MAX,
                                                            false,
                                                            true>(true,
@@ -53,6 +58,7 @@ class TestMaxPool3dFwd : public ::testing::Test
                                                                  param.length_,
                                                                  param.window_spatial_lengths_,
                                                                  param.window_strides_,
+                                                                  param.window_dilations_,
                                                                  param.input_left_pads_,
                                                                  param.input_right_pads_);
            EXPECT_TRUE(success);
@@ -70,10 +76,11 @@ using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
 TYPED_TEST_SUITE(TestMaxPool3dFwd, KernelTypes);
 TYPED_TEST(TestMaxPool3dFwd, Test_Pool)
 {
-    // length, window_length, window_stride, left_pad, right_pad
+    // length, window_length, window_stride, window_dilation, left_pad, right_pad
-    this->params = {{{1, 1, 1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
+    this->params = {{{1, 1, 1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
-                    {{2, 16, 64, 64, 64}, {64, 64, 64}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
+                    {{2, 16, 64, 64, 64}, {64, 64, 64}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
-                    {{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}}};
+                    {{2, 16, 64, 64, 64}, {4, 4, 4}, {4, 4, 4}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}},
+                    {{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
    this->Run();
 }
--- a/test/pool_fwd/test_pool_fwd_common.hpp
+++ b/test/pool_fwd/test_pool_fwd_common.hpp
@@ -14,11 +14,13 @@ struct PoolingParam
    PoolingParam(const std::vector<index_t>& length,
                 const std::vector<index_t>& window_spatial_lengths,
                 const std::vector<index_t>& window_strides,
+                 const std::vector<index_t>& window_dilations,
                 const std::vector<index_t>& input_left_pads,
                 const std::vector<index_t>& input_right_pads)
        : length_(length),
          window_spatial_lengths_(window_spatial_lengths),
          window_strides_(window_strides),
+          window_dilations_(window_dilations),
          input_left_pads_(input_left_pads),
          input_right_pads_(input_right_pads)
    {
@@ -26,6 +28,7 @@ struct PoolingParam
    std::vector<index_t> length_;
    std::vector<index_t> window_spatial_lengths_;
    std::vector<index_t> window_strides_;
+    std::vector<index_t> window_dilations_;
    std::vector<index_t> input_left_pads_;
    std::vector<index_t> input_right_pads_;
 };