Merge remote-tracking branch 'origin/develop' into ck_tile/fav3_fwd_sept

2e4e3cf6 · carlushuang · e1396d87 · a793afc9 · 2e4e3cf6 · 2e4e3cf6
Commit 2e4e3cf6 authored Sep 17, 2024 by carlushuang
19 changed files
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
-rocm-docs-core==1.7.2
+rocm-docs-core==1.8.0
 sphinxcontrib-bibtex==2.6.3
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.7.2
+rocm-docs-core==1.8.0
    # via -r requirements.in
 six==1.16.0
    # via pybtex

--- a/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -22,7 +22,7 @@ static constexpr auto WindowRank = 3;

 static constexpr auto MaxOp = ck::ReduceTensorOp::MAX;
 static constexpr auto AvgOp = ck::ReduceTensorOp::AVG;
-#ifdef CK_ENABLE_FP16
+
 // FP16
 void add_device_pool3d_fwd_ndhwc_f16_instances(
    std::vector<std::unique_ptr<
@@ -36,8 +36,22 @@ void add_device_pool3d_fwd_ndhwc_f16_instances(
 void add_device_pool3d_fwd_ndhwc_index_f16_instances(
    std::vector<std::unique_ptr<
        DevicePoolFwd<InOutRank, WindowRank, F16, F16, I32, NDHWC, NDHWC, MaxOp, true>>>&);
-#endif
-#ifdef CK_ENABLE_BF16
+
+using F8 = ck::f8_t; // alias for the element type named in the FP8 declarations below
+// FP8 - two overloads of the same name, one per reduction (MaxOp, AvgOp), both without index output
+void add_device_pool3d_fwd_ndhwc_f8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NDHWC, NDHWC, MaxOp, false>>>&);
+
+void add_device_pool3d_fwd_ndhwc_f8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NDHWC, NDHWC, AvgOp, false>>>&);
+
+// FP8 - return index (MaxOp only; the factory in this header calls it when OutputIndex is set and ReduceOpId == MaxOp)
+void add_device_pool3d_fwd_ndhwc_index_f8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NDHWC, NDHWC, MaxOp, true>>>&);
+
 // BF16
 void add_device_pool3d_fwd_ndhwc_bf16_instances(
    std::vector<std::unique_ptr<
@@ -51,8 +65,7 @@ void add_device_pool3d_fwd_ndhwc_bf16_instances(
 void add_device_pool3d_fwd_ndhwc_index_bf16_instances(
    std::vector<std::unique_ptr<
        DevicePoolFwd<InOutRank, WindowRank, BF16, BF16, I32, NDHWC, NDHWC, MaxOp, true>>>&);
-#endif
-#ifdef CK_ENABLE_FP32
+
 // FP32
 void add_device_pool3d_fwd_ndhwc_f32_instances(
    std::vector<std::unique_ptr<
@@ -66,7 +79,21 @@ void add_device_pool3d_fwd_ndhwc_f32_instances(
 void add_device_pool3d_fwd_ndhwc_index_f32_instances(
    std::vector<std::unique_ptr<
        DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, NDHWC, NDHWC, MaxOp, true>>>&);
-#endif
+
+// I8 - two overloads of the same name, one per reduction (MaxOp, AvgOp), both without index output
+void add_device_pool3d_fwd_ndhwc_i8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NDHWC, NDHWC, MaxOp, false>>>&);
+
+void add_device_pool3d_fwd_ndhwc_i8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NDHWC, NDHWC, AvgOp, false>>>&);
+
+// I8 - return index (MaxOp only; the factory in this header calls it when OutputIndex is set and ReduceOpId == MaxOp)
+void add_device_pool3d_fwd_ndhwc_index_i8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NDHWC, NDHWC, MaxOp, true>>>&);
+
 template <typename InDataType,
          typename OutDataType,
          typename IndexDataType,
@@ -99,7 +126,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
        if constexpr(is_same_v<InLayout, NDHWC> && is_same_v<OutLayout, NDHWC>)
        {
-#ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, F16> && is_same_v<OutDataType, F16> &&
                         is_same_v<IndexDataType, I32>)
            {
@@ -112,8 +138,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
                    add_device_pool3d_fwd_ndhwc_f16_instances(op_ptrs);
                }
            }
-#endif
-#ifdef CK_ENABLE_BF16
            else if constexpr(is_same_v<InDataType, BF16> && is_same_v<OutDataType, BF16> &&
                              is_same_v<IndexDataType, I32>)
            {
@@ -126,8 +150,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
                    add_device_pool3d_fwd_ndhwc_bf16_instances(op_ptrs);
                }
            }
-#endif
-#ifdef CK_ENABLE_FP32
            else if constexpr(is_same_v<InDataType, F32> && is_same_v<OutDataType, F32> &&
                              is_same_v<IndexDataType, I32>)
            {
@@ -140,7 +162,30 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
                    add_device_pool3d_fwd_ndhwc_f32_instances(op_ptrs);
                }
            }
-#endif
+            else if constexpr(is_same_v<InDataType, F8> && is_same_v<OutDataType, F8> &&
+                              is_same_v<IndexDataType, I32>)
+            {
+                if constexpr(OutputIndex && ReduceOpId == MaxOp)
+                {
+                    add_device_pool3d_fwd_ndhwc_index_f8_instances(op_ptrs);
+                }
+                else
+                {
+                    add_device_pool3d_fwd_ndhwc_f8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(is_same_v<InDataType, I8> && is_same_v<OutDataType, I8> &&
+                              is_same_v<IndexDataType, I32>)
+            {
+                if constexpr(OutputIndex && ReduceOpId == MaxOp)
+                {
+                    add_device_pool3d_fwd_ndhwc_index_i8_instances(op_ptrs);
+                }
+                else
+                {
+                    add_device_pool3d_fwd_ndhwc_i8_instances(op_ptrs);
+                }
+            }
        }

        return op_ptrs;

--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt
 set(DEVICE_POOL3D_FWD_INSTANCES)
 list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
                                        device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+                                        device_max_pool3d_fwd_ndhwc_f8_instance.cpp
+                                        device_avg_pool3d_fwd_ndhwc_f8_instance.cpp
+                                        device_max_pool3d_fwd_ndhwc_i8_instance.cpp
+                                        device_avg_pool3d_fwd_ndhwc_i8_instance.cpp
                                        device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
                                        device_max_pool3d_fwd_ndhwc_f32_instance.cpp
                                        device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp

--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "pool_fwd_instance_common.hpp"


--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; // this file covers the AVG reduction only
+// Forwards `instances` to add_device_operation_instances together with the F8 AVG instance list (no index output).
+void add_device_pool3d_fwd_ndhwc_f8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances( // NOTE(review): 4th template argument (F32) is presumably the compute type - confirm
+        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_i8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_i8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; // this file covers the AVG reduction only
+// Forwards `instances` to add_device_operation_instances together with the I8 AVG instance list (no index output).
+void add_device_pool3d_fwd_ndhwc_i8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, I8, I8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances( // NOTE(review): 4th template argument (I32) is presumably the compute type - confirm
+        instances, device_pool3d_fwd_ndhwc_instances<I8, I8, I32, I32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "pool_fwd_instance_common.hpp"


--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "pool_fwd_instance_common.hpp"


--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; // this file covers the MAX reduction only
+// Forwards `instances` to add_device_operation_instances together with the F8 MAX instance list (no index output).
+void add_device_pool3d_fwd_ndhwc_f8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances( // NOTE(review): 4th template argument (F8) is presumably the compute type - confirm
+        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, false>{});
+}
+// Index-returning variant: identical except that the last template argument is true.
+void add_device_pool3d_fwd_ndhwc_index_f8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ReduceOpId, true>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_i8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_i8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; // this file covers the MAX reduction only
+// Forwards `instances` to add_device_operation_instances together with the I8 MAX instance list (no index output).
+void add_device_pool3d_fwd_ndhwc_i8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, I8, I8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances( // NOTE(review): 4th template argument (I8) is presumably the compute type - confirm
+        instances, device_pool3d_fwd_ndhwc_instances<I8, I8, I32, I8, ReduceOpId, false>{});
+}
+// Index-returning variant: identical except that the last template argument is true.
+void add_device_pool3d_fwd_ndhwc_index_i8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, I8, I8, I32, NDHWC, NDHWC, ReduceOpId, true>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<I8, I8, I32, I8, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -15,6 +15,8 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

+using I8    = int8_t;
+using F8    = ck::f8_t;
 using I32   = int32_t;
 using F16   = ck::half_t;
 using BF16  = ck::bhalf_t;

--- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -17,6 +17,26 @@
 namespace ck {
 namespace profiler {

+struct PoolFwdInputParams // run-control options read by profile_pool3d_fwd_impl; no default initializers, so set every field
+{
+    int do_verification; // non-zero: run the host reference pooling and check the device output against it
+    int init_method;     // selects the input tensor generator in profile_pool3d_fwd_impl's switch
+    bool do_log;         // during verification, also print the tensors to std::cout
+    bool time_kernel;    // forwarded to StreamConfig; also gates the perf / skip / pass messages
+    bool return_index;   // NOTE(review): not read in the visible part of profile_pool3d_fwd_impl - presumably used by the caller; confirm
+    int reduce_op;       // NOTE(review): not read in the visible part of profile_pool3d_fwd_impl - presumably used by the caller; confirm
+};
+
+struct PoolFwdKernelParams // problem description; profile_pool3d_fwd_impl returns false if any vector has the wrong size
+{
+    std::vector<index_t> in_length; // NCDHW - must hold 5 entries (InOutRank)
+    std::vector<index_t> window_spatial_lengths; // must hold 3 entries (WindowRank), as must every vector below
+    std::vector<index_t> window_strides;         // divisor in the output-length computation
+    std::vector<index_t> window_dilations;       // effective window = (size - 1) * dilation + 1
+    std::vector<index_t> input_left_pads;        // added to the input length before subtracting the effective window
+    std::vector<index_t> input_right_pads;       // added to the input length before subtracting the effective window
+};
+
 template <typename InDataType,
          typename OutDataType,
          typename ComputeDataType,
@@ -26,29 +46,23 @@ template <typename InDataType,
          ck::ReduceTensorOp ReduceOpId,
          bool PropagateNan,
          bool OutputIndex>
-bool profile_pool3d_fwd_impl(int do_verification,
-                             int init_method,
-                             bool do_log,
-                             bool time_kernel,
-                             std::vector<index_t> in_length, // NCDHW
-                             std::vector<index_t> window_spatial_lengths,
-                             std::vector<index_t> window_strides,
-                             std::vector<index_t> window_dilations,
-                             std::vector<index_t> input_left_pads,
-                             std::vector<index_t> input_right_pads)
+bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& kernel_params)
 {
    constexpr index_t InOutRank  = 5;
    constexpr index_t WindowRank = 3;

-    if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
-       window_strides.size() != WindowRank || window_dilations.size() != WindowRank ||
-       input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank)
+    if(kernel_params.in_length.size() != InOutRank ||
+       kernel_params.window_spatial_lengths.size() != WindowRank ||
+       kernel_params.window_strides.size() != WindowRank ||
+       kernel_params.window_dilations.size() != WindowRank ||
+       kernel_params.input_left_pads.size() != WindowRank ||
+       kernel_params.input_right_pads.size() != WindowRank)
        return false;

    std::vector<index_t> out_length(InOutRank);

-    int N = in_length[0];
-    int C = in_length[1];
+    int N = kernel_params.in_length[0];
+    int C = kernel_params.in_length[1];

    out_length[0] = N;
    out_length[1] = C;
@@ -56,18 +70,18 @@ bool profile_pool3d_fwd_impl(int do_verification,
    // Calculate Do, Ho, Wo
    for(int i = 2; i < InOutRank; ++i)
    {
-        auto pad1             = input_left_pads[i - 2];
-        auto pad2             = input_right_pads[i - 2];
-        auto windows_size     = window_spatial_lengths[i - 2];
-        auto windows_stride   = window_strides[i - 2];
-        auto windows_dilation = window_dilations[i - 2];
+        auto pad1             = kernel_params.input_left_pads[i - 2];
+        auto pad2             = kernel_params.input_right_pads[i - 2];
+        auto windows_size     = kernel_params.window_spatial_lengths[i - 2];
+        auto windows_stride   = kernel_params.window_strides[i - 2];
+        auto windows_dilation = kernel_params.window_dilations[i - 2];
        auto eff              = (windows_size - 1) * windows_dilation + 1;
-        out_length[i]         = (in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
+        out_length[i] = (kernel_params.in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
    }

-    int Di = in_length[2];
-    int Hi = in_length[3];
-    int Wi = in_length[4];
+    int Di = kernel_params.in_length[2];
+    int Hi = kernel_params.in_length[3];
+    int Wi = kernel_params.in_length[4];
    int Do = out_length[2];
    int Ho = out_length[3];
    int Wo = out_length[4];
@@ -88,7 +102,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
    Tensor<IndexDataType> out_indices_n_c_do_ho_wo_device(
        f_host_tensor_descriptor(N, C, Do, Ho, Wo));

-    switch(init_method)
+    switch(in_params.init_method)
    {
    case 0: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
    case 1: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
@@ -125,7 +139,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

-    if(do_verification)
+    if(in_params.do_verification)
    {
        using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
                                                                                  WindowRank,
@@ -141,11 +155,11 @@ bool profile_pool3d_fwd_impl(int do_verification,
        auto ref_argument = ref.MakeArgument(in_n_c_di_hi_wi,
                                             out_n_c_do_ho_wo_host,
                                             out_indices_n_c_do_ho_wo_host,
-                                             window_spatial_lengths,
-                                             window_strides,
-                                             window_dilations,
-                                             input_left_pads,
-                                             input_right_pads);
+                                             kernel_params.window_spatial_lengths,
+                                             kernel_params.window_strides,
+                                             kernel_params.window_dilations,
+                                             kernel_params.input_left_pads,
+                                             kernel_params.input_right_pads);
        auto ref_invoker  = ref.MakeInvoker();
        ref_invoker.Run(ref_argument);
    }
@@ -158,16 +172,16 @@ bool profile_pool3d_fwd_impl(int do_verification,
            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
-            in_length,
-            window_spatial_lengths,
+            kernel_params.in_length,
+            kernel_params.window_spatial_lengths,
            out_length,
            {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C},
            {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
            {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
-            window_strides,
-            window_dilations,
-            input_left_pads,
-            input_right_pads,
+            kernel_params.window_strides,
+            kernel_params.window_dilations,
+            kernel_params.input_left_pads,
+            kernel_params.input_right_pads,
            {2, 3, 4});

        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
@@ -176,10 +190,11 @@ bool profile_pool3d_fwd_impl(int do_verification,
        }
        else
        {
-            if(time_kernel)
+            if(in_params.time_kernel)
            {
                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
-                LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl;
+                LogRange(std::cout << "input lengths = ", kernel_params.in_length, ", ")
+                    << std::endl;
            }

            continue;
@@ -187,7 +202,8 @@ bool profile_pool3d_fwd_impl(int do_verification,

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

-        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+        float avg_time =
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, in_params.time_kernel});

        std::size_t num_bytes = in_n_c_di_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) +
                                out_n_c_do_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType);
@@ -198,7 +214,7 @@ bool profile_pool3d_fwd_impl(int do_verification,

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

-        if(time_kernel)
+        if(in_params.time_kernel)
            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
                      << inst_ptr->GetTypeString() << std::endl;

@@ -209,25 +225,25 @@ bool profile_pool3d_fwd_impl(int do_verification,
            best_gb_per_sec    = gb_per_sec;
        }

-        if(do_verification)
+        if(in_params.do_verification)
        {
            out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());

-            bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
+            auto tolerance = 1e-3;
+            bool pass      = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
                                             out_n_c_do_ho_wo_host.mData,
                                             "Error: Incorrect results",
-                                             1e-3,
-                                             1e-3);
+                                             tolerance,
+                                             tolerance);

            if constexpr(OutputIndex)
            {
                out_indices_device_buf.FromDevice(out_indices_n_c_do_ho_wo_device.mData.data());
-
                pass = pass && ck::utils::check_err(out_indices_n_c_do_ho_wo_device,
                                                    out_indices_n_c_do_ho_wo_host);
            }

-            if(do_log)
+            if(in_params.do_log)
            {
                LogRangeAsType<float>(
                    std::cout << "in_n_c_di_hi_wi  : ", in_n_c_di_hi_wi.mData, ",")
@@ -249,20 +265,21 @@ bool profile_pool3d_fwd_impl(int do_verification,
            if(!pass)
            {
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
-                LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl;
+                LogRange(std::cout << "lengths = [", kernel_params.in_length, ", ")
+                    << "]." << std::endl;
                return false;
            }
            else
            {
-                if(time_kernel)
+                if(in_params.time_kernel)
                    std::cout << "pass" << std::endl;
            }
        }
    }

-    if(time_kernel)
+    if(in_params.time_kernel)
    {
-        LogRange(std::cout << "length = ", in_length, ",") << std::endl;
+        LogRange(std::cout << "length = ", kernel_params.in_length, ",") << std::endl;
        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_instance_name << std::endl;
    }

--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -10,7 +10,7 @@ set(PROFILER_SOURCES
    profile_groupnorm_bwd_gamma_beta.cpp
    profile_layernorm_fwd.cpp
    profile_max_pool2d_fwd.cpp
-    profile_max_pool3d_fwd.cpp
+    profile_pool3d_fwd.cpp
    profile_avg_pool3d_bwd.cpp
    profile_max_pool3d_bwd.cpp
    profile_avg_pool2d_bwd.cpp

--- a/profiler/src/profile_max_pool3d_fwd.cpp
+++ b/profiler/src/profile_max_pool3d_fwd.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <vector>
-#include <unordered_map>
-
-#include "profiler/data_type_enum.hpp"
-#include "profiler/profile_pool3d_fwd_impl.hpp"
-#include "profiler_operation_registry.hpp"
-
-using ck::index_t;
-
-struct maxPoolFwdArgParser
-{
-    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
-                                                                   {"wsize", {}},
-                                                                   {"wstride", {}},
-                                                                   {"wdilation", {}},
-                                                                   {"pad1", {}},
-                                                                   {"pad2", {}}};
-
-    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
-    {
-        if(std::string("--") + key == argv[i])
-        {
-            int pos = i;
-            while(++i < argc && argv[i][0] != '-') {}
-            int end = i;
-            for(int j = pos + 1; j < end; j++)
-            {
-                long_opts[key].push_back(std::stoi(argv[j]));
-            }
-            return true;
-        }
-        return false;
-    }
-
-    void operator()(int argc, char* argv[])
-    {
-        for(auto& kv : long_opts)
-        {
-            for(int i = 1; i < argc; i++)
-            {
-                if(parse_opt(argc, argv, kv.first, i))
-                    break;
-            }
-        }
-    }
-};
-
-void print_help_max_pool3d_fwd()
-{
-    std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
-              << "arg2: verification (0: no; 1: yes)\n"
-              << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
-              << "arg4: print tensor value (0: no; 1: yes)\n"
-              << "arg5: time kernel (0=no, 1=yes)\n"
-              << "arg6: return index (0=no, 1=yes)\n"
-              << "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
-              << "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
-              << "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
-              << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
-              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
-              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
-              << "eg: ckProfiler max_pool3d_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 "
-                 "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
-              << std::endl;
-}
-
-int profile_max_pool3d_fwd(int argc, char* argv[])
-{
-    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
-    bool do_verification       = true;
-    int init_method            = 0;
-    bool do_log                = false;
-    bool time_kernel           = true;
-    bool return_index          = false;
-
-    std::vector<index_t> in_length = {2, 32, 30, 30, 30};
-    std::vector<index_t> wsize     = {2, 2, 2};
-    std::vector<index_t> wstride   = {2, 2, 2};
-    std::vector<index_t> wdilation = {1, 1, 1};
-    std::vector<index_t> pad1      = {1, 1, 1};
-    std::vector<index_t> pad2      = {1, 1, 1};
-
-    if(argc != 2 && argc != 34)
-    {
-        print_help_max_pool3d_fwd();
-        return 0;
-    }
-    else if(argc == 34)
-    {
-        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
-        do_verification = std::stoi(argv[3]);
-        init_method     = std::stoi(argv[4]);
-        do_log          = std::stoi(argv[5]);
-        time_kernel     = std::stoi(argv[6]);
-        return_index    = std::stoi(argv[7]);
-
-        // parse the long options
-        maxPoolFwdArgParser arg_parser;
-        arg_parser(argc, argv);
-        in_length = arg_parser.long_opts["length"];
-        wsize     = arg_parser.long_opts["wsize"];
-        wstride   = arg_parser.long_opts["wstride"];
-        wdilation = arg_parser.long_opts["wdilation"];
-        pad1      = arg_parser.long_opts["pad1"];
-        pad2      = arg_parser.long_opts["pad2"];
-    }
-
-#ifdef CK_ENABLE_FP16
-    using F16 = ck::half_t;
-#endif
-#ifdef CK_ENABLE_BF16
-    using BF16 = ck::bhalf_t;
-#endif
-#ifdef CK_ENABLE_FP32
-    using F32 = float;
-#endif
-    using I32   = int32_t;
-    using NDHWC = ck::tensor_layout::convolution::NDHWC;
-
-#if 1
-    constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
-#else
-    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
-#endif
-
-    if(false)
-        ;
-#ifdef CK_ENABLE_FP16
-    else if(data_type == ck::DataTypeEnum::Half)
-    {
-        if(return_index)
-            ck::profiler::
-                profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
-                    do_verification,
-                    init_method,
-                    do_log,
-                    time_kernel,
-                    in_length,
-                    wsize,
-                    wstride,
-                    wdilation,
-                    pad1,
-                    pad2);
-        else
-            ck::profiler::
-                profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
-                    do_verification,
-                    init_method,
-                    do_log,
-                    time_kernel,
-                    in_length,
-                    wsize,
-                    wstride,
-                    wdilation,
-                    pad1,
-                    pad2);
-    }
-#endif
-#ifdef CK_ENABLE_BF16
-    else if(data_type == ck::DataTypeEnum::BFloat16)
-    {
-        if(return_index)
-            ck::profiler::profile_pool3d_fwd_impl<BF16,
-                                                  BF16,
-                                                  BF16,
-                                                  I32,
-                                                  NDHWC,
-                                                  NDHWC,
-                                                  ReduceOpId,
-                                                  false,
-                                                  true>(do_verification,
-                                                        init_method,
-                                                        do_log,
-                                                        time_kernel,
-                                                        in_length,
-                                                        wsize,
-                                                        wstride,
-                                                        wdilation,
-                                                        pad1,
-                                                        pad2);
-        else
-            ck::profiler::profile_pool3d_fwd_impl<BF16,
-                                                  BF16,
-                                                  BF16,
-                                                  I32,
-                                                  NDHWC,
-                                                  NDHWC,
-                                                  ReduceOpId,
-                                                  false,
-                                                  false>(do_verification,
-                                                         init_method,
-                                                         do_log,
-                                                         time_kernel,
-                                                         in_length,
-                                                         wsize,
-                                                         wstride,
-                                                         wdilation,
-                                                         pad1,
-                                                         pad2);
-    }
-#endif
-#ifdef CK_ENABLE_FP32
-    else if(data_type == ck::DataTypeEnum::Float)
-    {
-        if(return_index)
-            ck::profiler::
-                profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
-                    do_verification,
-                    init_method,
-                    do_log,
-                    time_kernel,
-                    in_length,
-                    wsize,
-                    wstride,
-                    wdilation,
-                    pad1,
-                    pad2);
-        else
-            ck::profiler::
-                profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
-                    do_verification,
-                    init_method,
-                    do_log,
-                    time_kernel,
-                    in_length,
-                    wsize,
-                    wstride,
-                    wdilation,
-                    pad1,
-                    pad2);
-    }
-#endif
-    else
-    {
-        throw std::runtime_error("not implemented yet");
-    }
-
-    return 0;
-}
-
-REGISTER_PROFILER_OPERATION("max_pool3d_fwd", "max_pool3d fwd", profile_max_pool3d_fwd);
--- a/profiler/src/profile_pool3d_fwd.cpp
+++ b/profiler/src/profile_pool3d_fwd.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <vector>
+#include <unordered_map>
+
+#include "profiler/data_type_enum.hpp"
+#include "profiler/profile_pool3d_fwd_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+using ck::index_t;
+
+// Gathers the integer values that follow each recognised "--<name>" long option.
+struct poolFwdArgParser
+{
+    // Recognised option names mapped to the values collected for them (all empty initially).
+    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
+                                                                   {"wsize", {}},
+                                                                   {"wstride", {}},
+                                                                   {"wdilation", {}},
+                                                                   {"pad1", {}},
+                                                                   {"pad2", {}}};
+
+    // Returns false unless argv[i] is exactly "--" + key. On a match, every following
+    // argument up to (but excluding) the next one whose first character is '-' is converted
+    // with std::stoi and appended to long_opts[key]; the function then returns true.
+    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
+    {
+        const std::string flag = "--" + key;
+        if(flag != argv[i])
+        {
+            return false;
+        }
+
+        // The value list ends at the end of argv or at the next '-'-prefixed argument.
+        for(int cursor = i + 1; cursor < argc && argv[cursor][0] != '-'; ++cursor)
+        {
+            long_opts[key].push_back(std::stoi(argv[cursor]));
+        }
+        return true;
+    }
+
+    // For every recognised option, walks argv[1..argc) and parses its first occurrence only.
+    void operator()(int argc, char* argv[])
+    {
+        for(auto& entry : long_opts)
+        {
+            int idx = 1;
+            while(idx < argc && !parse_opt(argc, argv, entry.first, idx))
+            {
+                ++idx;
+            }
+        }
+    }
+};
+
+// Prints the usage text of the pool3d_fwd profiler operation to std::cout.
+//
+// The sample command at the end spells out all seven positional arguments (arg1..arg7)
+// followed by the six long options, so that running it verbatim yields the argc == 35 that
+// profile_pool3d_fwd requires before it parses anything.
+void print_help_pool3d_fwd()
+{
+    std::cout << "arg1: data type (0: fp16; 1: fp32; 3: int8; 5: bf16; 7: fp8)\n"
+              << "arg2: verification (0: no; 1: yes)\n"
+              << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
+              << "arg4: print tensor value (0: no; 1: yes)\n"
+              << "arg5: time kernel (0=no, 1=yes)\n"
+              << "arg6: return index (0=no, 1=yes)\n"
+              << "arg7: reduce op (0: max; 1: avg)\n"
+              << "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
+              << "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
+              << "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
+              << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
+              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
+              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
+              // arg1..arg7 = 0 1 2 0 1 0 0 (the trailing 0 is the reduce op, i.e. max).
+              << "eg: ckProfiler pool3d_fwd 0 1 2 0 1 0 0 --length 2 32 30 30 30 "
+                 "--wsize 2 2 2 --wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
+              << std::endl;
+}
+
+// Entry point of the "pool3d_fwd" profiler operation.
+//
+// Accepted command lines:
+//   argc == 2  : run with the built-in defaults declared below.
+//   argc == 35 : argv[2]..argv[8] are the seven positional settings (data type, verification,
+//                init method, log, time kernel, return index, reduce op); the whole argv is
+//                additionally scanned for the --length/--wsize/--wstride/--wdilation/--pad1/
+//                --pad2 long options.
+//   otherwise  : the usage text is printed and 0 is returned.
+//
+// Dispatch: reduce_op == 1 selects the AVG instantiation (return_index is not consulted on
+// that path); any other reduce_op selects MAX, with or without index output depending on
+// return_index.
+//
+// Returns 0 when the selected profile_pool3d_fwd_impl instantiation reports success (true)
+// and 1 otherwise, uniformly for every data type.
+// NOTE(review): assumes the profiler driver forwards this value as the process exit status
+// (0 == success) - confirm against the operation registry.
+//
+// Throws std::runtime_error when data_type has no dispatch branch.
+int profile_pool3d_fwd(int argc, char* argv[])
+{
+    // Defaults that stay in effect when only the operation name is supplied (argc == 2).
+    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
+    ck::profiler::PoolFwdInputParams in_params{true, 0, false, true, false, 0};
+    ck::profiler::PoolFwdKernelParams kernel_params{
+        {2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+
+    if(argc != 2 && argc != 35)
+    {
+        print_help_pool3d_fwd();
+        return 0;
+    }
+    else if(argc == 35)
+    {
+        data_type                 = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
+        in_params.do_verification = std::stoi(argv[3]);
+        in_params.init_method     = std::stoi(argv[4]);
+        in_params.do_log          = std::stoi(argv[5]);
+        in_params.time_kernel     = std::stoi(argv[6]);
+        in_params.return_index    = std::stoi(argv[7]);
+        in_params.reduce_op       = std::stoi(argv[8]);
+
+        // parse the long options
+        poolFwdArgParser arg_parser;
+        arg_parser(argc, argv);
+        kernel_params.in_length              = arg_parser.long_opts["length"];
+        kernel_params.window_spatial_lengths = arg_parser.long_opts["wsize"];
+        kernel_params.window_strides         = arg_parser.long_opts["wstride"];
+        kernel_params.window_dilations       = arg_parser.long_opts["wdilation"];
+        kernel_params.input_left_pads        = arg_parser.long_opts["pad1"];
+        kernel_params.input_right_pads       = arg_parser.long_opts["pad2"];
+    }
+
+    using F16   = ck::half_t;
+    using BF16  = ck::bhalf_t;
+    using F32   = float;
+    using I8    = int8_t;
+    using I32   = int32_t;
+    using F8    = ck::f8_t;
+    using NDHWC = ck::tensor_layout::convolution::NDHWC;
+
+    // Result of the selected instantiation. Every branch stores it here so that the
+    // bool -> exit-code conversion happens in exactly one place; returning the bool directly
+    // would turn a successful run (true) into the non-zero value 1.
+    bool pass = false;
+
+    if(data_type == ck::DataTypeEnum::Half)
+    {
+        if(in_params.reduce_op == 1)
+        {
+            pass = ck::profiler::profile_pool3d_fwd_impl<F16,
+                                                         F16,
+                                                         F32,
+                                                         I32,
+                                                         NDHWC,
+                                                         NDHWC,
+                                                         ck::ReduceTensorOp::AVG,
+                                                         false,
+                                                         false>(in_params, kernel_params);
+        }
+        else
+        { // reduce_op == 0
+            if(in_params.return_index)
+            {
+                pass = ck::profiler::profile_pool3d_fwd_impl<F16,
+                                                             F16,
+                                                             F16,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             true>(in_params, kernel_params);
+            }
+            else
+            {
+                pass = ck::profiler::profile_pool3d_fwd_impl<F16,
+                                                             F16,
+                                                             F16,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             false>(in_params, kernel_params);
+            }
+        }
+    }
+    else if(data_type == ck::DataTypeEnum::BFloat16)
+    {
+        if(in_params.reduce_op == 1)
+        {
+            pass = ck::profiler::profile_pool3d_fwd_impl<BF16,
+                                                         BF16,
+                                                         F32,
+                                                         I32,
+                                                         NDHWC,
+                                                         NDHWC,
+                                                         ck::ReduceTensorOp::AVG,
+                                                         false,
+                                                         false>(in_params, kernel_params);
+        }
+        else
+        { // reduce_op == 0
+            if(in_params.return_index)
+            {
+                pass = ck::profiler::profile_pool3d_fwd_impl<BF16,
+                                                             BF16,
+                                                             BF16,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             true>(in_params, kernel_params);
+            }
+            else
+            {
+                pass = ck::profiler::profile_pool3d_fwd_impl<BF16,
+                                                             BF16,
+                                                             BF16,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             false>(in_params, kernel_params);
+            }
+        }
+    }
+    else if(data_type == ck::DataTypeEnum::Float)
+    {
+        if(in_params.reduce_op == 1)
+        {
+            pass = ck::profiler::profile_pool3d_fwd_impl<F32,
+                                                         F32,
+                                                         F32,
+                                                         I32,
+                                                         NDHWC,
+                                                         NDHWC,
+                                                         ck::ReduceTensorOp::AVG,
+                                                         false,
+                                                         false>(in_params, kernel_params);
+        }
+        else
+        { // reduce_op == 0
+            if(in_params.return_index)
+            {
+                pass = ck::profiler::profile_pool3d_fwd_impl<F32,
+                                                             F32,
+                                                             F32,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             true>(in_params, kernel_params);
+            }
+            else
+            {
+                pass = ck::profiler::profile_pool3d_fwd_impl<F32,
+                                                             F32,
+                                                             F32,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             false>(in_params, kernel_params);
+            }
+        }
+    }
+    else if(data_type == ck::DataTypeEnum::Float8)
+    {
+        if(in_params.reduce_op == 1)
+        {
+            pass = ck::profiler::profile_pool3d_fwd_impl<F8,
+                                                         F8,
+                                                         F32,
+                                                         I32,
+                                                         NDHWC,
+                                                         NDHWC,
+                                                         ck::ReduceTensorOp::AVG,
+                                                         false,
+                                                         false>(in_params, kernel_params);
+        }
+        else
+        { // reduce_op == 0
+            if(in_params.return_index)
+            {
+                pass = ck::profiler::profile_pool3d_fwd_impl<F8,
+                                                             F8,
+                                                             F8,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             true>(in_params, kernel_params);
+            }
+            else
+            {
+                pass = ck::profiler::profile_pool3d_fwd_impl<F8,
+                                                             F8,
+                                                             F8,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             false>(in_params, kernel_params);
+            }
+        }
+    }
+    else if(data_type == ck::DataTypeEnum::Int8)
+    {
+        if(in_params.reduce_op == 1)
+        {
+            pass = ck::profiler::profile_pool3d_fwd_impl<I8,
+                                                         I8,
+                                                         I32,
+                                                         I32,
+                                                         NDHWC,
+                                                         NDHWC,
+                                                         ck::ReduceTensorOp::AVG,
+                                                         false,
+                                                         false>(in_params, kernel_params);
+        }
+        else
+        { // reduce_op == 0
+            if(in_params.return_index)
+            {
+                pass = ck::profiler::profile_pool3d_fwd_impl<I8,
+                                                             I8,
+                                                             I8,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             true>(in_params, kernel_params);
+            }
+            else
+            {
+                pass = ck::profiler::profile_pool3d_fwd_impl<I8,
+                                                             I8,
+                                                             I8,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             false>(in_params, kernel_params);
+            }
+        }
+    }
+    else
+    {
+        throw std::runtime_error("not implemented yet");
+    }
+
+    return pass ? 0 : 1;
+}
+
+// Hands the name "pool3d_fwd", the description "pool3d fwd" and profile_pool3d_fwd to the
+// registration macro.
+// NOTE(review): presumably this is what makes `ckProfiler pool3d_fwd ...` dispatch to
+// profile_pool3d_fwd; the macro is defined outside this file - confirm.
+REGISTER_PROFILER_OPERATION("pool3d_fwd", "pool3d fwd", profile_pool3d_fwd);
--- a/test/pool/test_avg_pool3d_fwd.cpp
+++ b/test/pool/test_avg_pool3d_fwd.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
 #include "profiler/profile_pool3d_fwd_impl.hpp"
@@ -16,10 +16,19 @@ class TestAvgPool3dFwd : public ::testing::Test

    std::vector<PoolingParam> params;

+    ck::profiler::PoolFwdInputParams in_params_avg_pool{true, 2, false, false, false, 1};
+
    void Run()
    {
        for(auto param : params)
        {
+            ck::profiler::PoolFwdKernelParams kernel_params{param.length_,
+                                                            param.window_spatial_lengths_,
+                                                            param.window_strides_,
+                                                            param.window_dilations_,
+                                                            param.input_left_pads_,
+                                                            param.input_right_pads_};
+
            bool success =
                ck::profiler::profile_pool3d_fwd_impl<InDataType,
                                                      OutDataType,
@@ -29,26 +38,18 @@ class TestAvgPool3dFwd : public ::testing::Test
                                                      ck::tensor_layout::convolution::NDHWC,
                                                      ck::ReduceTensorOp::AVG,
                                                      false,
-                                                      false>(true,
-                                                             2,
-                                                             false,
-                                                             false,
-                                                             param.length_,
-                                                             param.window_spatial_lengths_,
-                                                             param.window_strides_,
-                                                             param.window_dilations_,
-                                                             param.input_left_pads_,
-                                                             param.input_right_pads_);
+                                                      false>(in_params_avg_pool, kernel_params);
            EXPECT_TRUE(success);
        }
    }
 };
-#ifdef CK_ENABLE_FP16
-using KernelTypes =
-    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
-#else
-using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
-#endif
+
+using KernelTypes = ::testing::Types<std::tuple<I8, I8, I32, I32>,
+                                     std::tuple<F8, F8, F32, I32>,
+                                     std::tuple<F16, F16, F32, I32>,
+                                     std::tuple<BF16, BF16, F32, I32>,
+                                     std::tuple<F32, F32, F32, I32>>;
+
 TYPED_TEST_SUITE(TestAvgPool3dFwd, KernelTypes);
 TYPED_TEST(TestAvgPool3dFwd, Test_Pool)
 {

--- a/test/pool/test_max_pool3d_fwd.cpp
+++ b/test/pool/test_max_pool3d_fwd.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
 #include "profiler/profile_pool3d_fwd_impl.hpp"
@@ -16,10 +16,20 @@ class TestMaxPool3dFwd : public ::testing::Test

    std::vector<PoolingParam> params;

+    ck::profiler::PoolFwdInputParams in_params_max_pool{true, 2, false, false, false, 0};
+    ck::profiler::PoolFwdInputParams in_params_max_pool_indexed{true, 2, false, false, true, 0};
+
    void Run()
    {
        for(auto param : params)
        {
+            ck::profiler::PoolFwdKernelParams kernel_params{param.length_,
+                                                            param.window_spatial_lengths_,
+                                                            param.window_strides_,
+                                                            param.window_dilations_,
+                                                            param.input_left_pads_,
+                                                            param.input_right_pads_};
+
            // max pool
            bool success =
                ck::profiler::profile_pool3d_fwd_impl<InDataType,
@@ -30,16 +40,7 @@ class TestMaxPool3dFwd : public ::testing::Test
                                                      ck::tensor_layout::convolution::NDHWC,
                                                      ck::ReduceTensorOp::MAX,
                                                      false,
-                                                      false>(true,
-                                                             2,
-                                                             false,
-                                                             false,
-                                                             param.length_,
-                                                             param.window_spatial_lengths_,
-                                                             param.window_strides_,
-                                                             param.window_dilations_,
-                                                             param.input_left_pads_,
-                                                             param.input_right_pads_);
+                                                      false>(in_params_max_pool, kernel_params);
            EXPECT_TRUE(success);

            // max pool + index
@@ -51,27 +52,18 @@ class TestMaxPool3dFwd : public ::testing::Test
                                                            ck::tensor_layout::convolution::NDHWC,
                                                            ck::ReduceTensorOp::MAX,
                                                            false,
-                                                            true>(true,
-                                                                  2,
-                                                                  false,
-                                                                  false,
-                                                                  param.length_,
-                                                                  param.window_spatial_lengths_,
-                                                                  param.window_strides_,
-                                                                  param.window_dilations_,
-                                                                  param.input_left_pads_,
-                                                                  param.input_right_pads_);
+                                                            true>(in_params_max_pool_indexed,
+                                                                  kernel_params);
            EXPECT_TRUE(success);
        }
    }
 };

-#ifdef CK_ENABLE_FP16
-using KernelTypes =
-    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
-#else
-using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
-#endif
+using KernelTypes = ::testing::Types<std::tuple<I8, I8, I8, I32>,
+                                     std::tuple<F8, F8, F8, I32>,
+                                     std::tuple<F16, F16, F16, I32>,
+                                     std::tuple<BF16, BF16, BF16, I32>,
+                                     std::tuple<F32, F32, F32, I32>>;

 TYPED_TEST_SUITE(TestMaxPool3dFwd, KernelTypes);
 TYPED_TEST(TestMaxPool3dFwd, Test_Pool)

--- a/test/pool/test_pool_fwd_common.hpp
+++ b/test/pool/test_pool_fwd_common.hpp
@@ -4,6 +4,8 @@
 #include "gtest/gtest.h"
 #include "ck/ck.hpp"

+using I8   = int8_t;
+using F8   = ck::f8_t;
 using F16  = ck::half_t;
 using BF16 = ck::bhalf_t;
 using F32  = float;