Merge branch 'amd-develop' into amd-master

b924e330 · Jun Liu · 72c9f129 · 9c0811f3 · b924e330 · b924e330
Commit b924e330 authored Oct 03, 2024 by Jun Liu
20 changed files
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                ConvFwdDefault>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                ConvFwd3x3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt
 set(DEVICE_MAXPOOL_BWD_INSTANCES)
 list(APPEND DEVICE_MAXPOOL_BWD_INSTANCES device_max_pool_bwd_f16_instance.cpp
                                         device_max_pool_bwd_bf16_instance.cpp
-                                         device_max_pool_bwd_f32_instance.cpp)
+                                         device_max_pool_bwd_f32_instance.cpp
+                                         device_max_pool_bwd_f8_instance.cpp
+                                         device_max_pool_bwd_int8_instance.cpp)
 add_instance_library(device_max_pool_bwd_instance ${DEVICE_MAXPOOL_BWD_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "max_pool_bwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_maxpool_bwd_f8_instances(
+    std::vector<std::unique_ptr<DeviceMaxPoolBwd<F8, I32, F8>>>& instances)
+{
+    add_device_operation_instances(instances, device_maxpool_bwd_instances<F8, I32, F8>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_int8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "max_pool_bwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_maxpool_bwd_int8_instances(
+    std::vector<std::unique_ptr<DeviceMaxPoolBwd<I8, I32, I8>>>& instances)
+{
+    add_device_operation_instances(instances, device_maxpool_bwd_instances<I8, I32, I8>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/max_pool_bwd/max_pool_bwd_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/max_pool_bwd/max_pool_bwd_instance_common.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -17,6 +17,8 @@ namespace instance {
 using I32  = int32_t;
 using F16  = ck::half_t;
 using BF16 = ck::bhalf_t;
+using I8   = int8_t;
+using F8   = ck::f8_t;
 using F32  = float;

 template <typename DOutDataType, typename IndexDataType, typename DInDataType>

--- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
@@ -27,23 +27,38 @@ rocm_install(FILES ${MHA_HEADERS} DESTINATION include/ck_tile/ops)
 # headers for building lib
 file(COPY ${MHA_HEADERS} DESTINATION ${FMHA_CPP_FOLDER})

+# Delete the blob file if it exists to avoid append of old content.
+if(EXISTS ${FMHA_CPP_FOLDER}/blob_list.txt)
+    file(REMOVE ${FMHA_CPP_FOLDER}/blob_list.txt)
+endif()
+
+set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd")
+
 # generate a list of kernels, but not actually emit files at config stage
+# Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time.
+# With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad.
 execute_process(
-  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
+  COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
  --list_blobs ${FMHA_CPP_FOLDER}/blob_list.txt
+  --api ${FMHA_KNOWN_APIS}
+  --receipt 3
  RESULT_VARIABLE ret
 )
 if(ret AND NOT ret EQUAL 0)
  message( FATAL_ERROR "CK Tile MHA FAILED to genrate a list of kernels via Python.")
 else()
-  file(STRINGS ${FMHA_CPP_FOLDER}/blob_list.txt FMHA_FWD_GEN_BLOBS)
+  file(STRINGS ${FMHA_CPP_FOLDER}/blob_list.txt FMHA_GEN_BLOBS)
 endif()

-# actually generate the cpp files
+# actually generate the kernel content now
+# Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time.
+# With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad.
 add_custom_command(
-  OUTPUT ${FMHA_FWD_GEN_BLOBS}
-  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
+  OUTPUT ${FMHA_GEN_BLOBS}
+  COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
  --output_dir ${FMHA_CPP_FOLDER}
+  --api ${FMHA_KNOWN_APIS}
+  --receipt 3
  COMMENT "Generating mha kernel (cpp) files now ..."
  VERBATIM
 )
@@ -52,12 +67,12 @@ add_custom_command(
 # have filename. Since, it was cauing the cmake
 # to throw "File name too long"
 set(device_files)
-foreach(filepath IN LISTS FMHA_FWD_GEN_BLOBS)
+foreach(filepath IN LISTS FMHA_GEN_BLOBS)
    get_filename_component(filename ${filepath} NAME)
    # Append the filename to the device_files list
    list(APPEND device_files ${filename})
 endforeach()
-add_custom_target(generate_cpp_files DEPENDS ${FMHA_FWD_GEN_BLOBS})
+add_custom_target(generate_cpp_files DEPENDS ${FMHA_GEN_BLOBS})

 add_instance_library(device_mha_instance ${device_files})


--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/CMakeLists.txt
+set(DEVICE_POOL2D_FWD_INSTANCES)
+list(APPEND DEVICE_POOL2D_FWD_INSTANCES device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+                                        device_max_pool2d_fwd_nhwc_f16_instance.cpp
+                                        device_avg_pool2d_fwd_nhwc_f32_instance.cpp
+                                        device_max_pool2d_fwd_nhwc_f32_instance.cpp
+                                        device_avg_pool2d_fwd_nhwc_bf16_instance.cpp
+                                        device_max_pool2d_fwd_nhwc_bf16_instance.cpp
+                                        device_avg_pool2d_fwd_nhwc_i8_instance.cpp
+                                        device_max_pool2d_fwd_nhwc_i8_instance.cpp
+                                        device_avg_pool2d_fwd_nhwc_f8_instance.cpp
+                                        device_max_pool2d_fwd_nhwc_f8_instance.cpp)
+add_instance_library(device_pool2d_fwd_instance ${DEVICE_POOL2D_FWD_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_bf16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool2d_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+
+void add_device_pool2d_fwd_nhwc_bf16_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<4, 2, BF16, BF16, I32, NHWC, NHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<BF16, BF16, I32, F32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool2d_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+
+void add_device_pool2d_fwd_nhwc_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, NHWC, NHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool2d_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+
+void add_device_pool2d_fwd_nhwc_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, NHWC, NHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool2d_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+
+void add_device_pool2d_fwd_nhwc_f8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F8, F8, I32, NHWC, NHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F8, F8, I32, F32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_i8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_i8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool2d_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+
+void add_device_pool2d_fwd_nhwc_i8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, I8, I8, I32, NHWC, NHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<I8, I8, I32, F32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_bf16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool2d_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+
+void add_device_pool2d_fwd_nhwc_bf16_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<4, 2, BF16, BF16, I32, NHWC, NHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<BF16, BF16, I32, F32, ReduceOpId, false>{});
+}
+
+void add_device_pool2d_fwd_nhwc_index_bf16_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<4, 2, BF16, BF16, I32, NHWC, NHWC, ReduceOpId, true>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<BF16, BF16, I32, F32, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool2d_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+
+void add_device_pool2d_fwd_nhwc_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, NHWC, NHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, false>{});
+}
+
+void add_device_pool2d_fwd_nhwc_index_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, NHWC, NHWC, ReduceOpId, true>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool2d_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+
+void add_device_pool2d_fwd_nhwc_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, NHWC, NHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>{});
+}
+
+void add_device_pool2d_fwd_nhwc_index_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, NHWC, NHWC, ReduceOpId, true>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool2d_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+
+void add_device_pool2d_fwd_nhwc_f8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F8, F8, I32, NHWC, NHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F8, F8, I32, F32, ReduceOpId, false>{});
+}
+
+void add_device_pool2d_fwd_nhwc_index_f8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F8, F8, I32, NHWC, NHWC, ReduceOpId, true>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F8, F8, I32, F32, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_i8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_i8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool2d_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+
+void add_device_pool2d_fwd_nhwc_i8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, I8, I8, I32, NHWC, NHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<I8, I8, I32, F32, ReduceOpId, false>{});
+}
+
+void add_device_pool2d_fwd_nhwc_index_i8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, I8, I8, I32, NHWC, NHWC, ReduceOpId, true>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<I8, I8, I32, F32, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/pool2d_fwd_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/pool2d_fwd_instance_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I32  = int32_t;
+using F32  = float;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using I8   = int8_t;
+using F8   = ck::f8_t;
+using NHWC = ck::tensor_layout::convolution::NHWC;
+
+template <typename InDataType,
+          typename OutDataType,
+          typename IndexDataType,
+          typename ComputeDataType,
+          ReduceTensorOp ReduceOpId,
+          bool OutputIndex>
+using device_pool2d_fwd_nhwc_instances =
+    // clang-format off
+    std::tuple <
+        DevicePool2dFwd_NHWC_NHWC<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 1, 1, 1>,
+        DevicePool2dFwd_NHWC_NHWC<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 2, 1, 2>,
+        DevicePool2dFwd_NHWC_NHWC<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 4, 1, 4>
+               // clang-format on
+               >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt
 set(DEVICE_POOL3D_FWD_INSTANCES)
 list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
                                        device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+                                        device_max_pool3d_fwd_ndhwc_f8_instance.cpp
+                                        device_avg_pool3d_fwd_ndhwc_f8_instance.cpp
+                                        device_max_pool3d_fwd_ndhwc_i8_instance.cpp
+                                        device_avg_pool3d_fwd_ndhwc_i8_instance.cpp
                                        device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
                                        device_max_pool3d_fwd_ndhwc_f32_instance.cpp
                                        device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp

--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "pool_fwd_instance_common.hpp"