Merge branch 'develop' of...

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into add_int8_wmma_example_instance

Merge branch 'develop' of...
Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into add_int8_wmma_example_instance
e24f37fb · aska-0096 · 9697ad4e · 85e2e1e2 · 9697ad4e · e24f37fb
Commit e24f37fb authored Sep 12, 2023 by aska-0096
20 changed files
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <tuple>
-
-#include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
-#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-template <index_t Rank, index_t Reduce>
-using device_softmax_i8_i8_instances = std::tuple<
-    // clang-format off
-    //                InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-    // fallback kernel
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,               16,              1,               1,              1>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,               16,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  4,                 64,                1,               16,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  2,                128,                1,               16,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  2,                128,                1,               32,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  2,                128,                1,               64,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  1,                256,                1,               16,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  1,                256,                1,               32,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  1,                256,                1,               64,              1,              16,             16>,
-    // Reduction on middle dimensions
-    // InSrcVectorDim is 0 since we want to coalesce reads on M dimension
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                8,                8,              0,               1,              1>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                 32,                  8,               32,                8,              0,              16,              8>
-    // clang-format on
-    >;
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
--- a/library/include/ck/library/utility/algorithm.hpp
+++ b/library/include/ck/library/utility/algorithm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/library/include/ck/library/utility/check_err.hpp
+++ b/library/include/ck/library/utility/check_err.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -65,7 +65,11 @@ check_err(const Range& out,
    }
    if(!res)
    {
-        std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+        const float error_percent =
+            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
+        std::cerr << "max err: " << max_err;
+        std::cerr << ", number of errors: " << err_count;
+        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
    }
    return res;
 }
@@ -112,7 +116,11 @@ check_err(const Range& out,
    }
    if(!res)
    {
-        std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+        const float error_percent =
+            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
+        std::cerr << "max err: " << max_err;
+        std::cerr << ", number of errors: " << err_count;
+        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
    }
    return res;
 }
@@ -158,7 +166,11 @@ check_err(const Range& out,
    }
    if(!res)
    {
-        std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+        const float error_percent =
+            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
+        std::cerr << "max err: " << max_err;
+        std::cerr << ", number of errors: " << err_count;
+        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
    }
    return res;
 }
@@ -209,7 +221,11 @@ check_err(const Range& out,
    }
    if(!res)
    {
-        std::cerr << "max err: " << max_err << std::endl;
+        const float error_percent =
+            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
+        std::cerr << "max err: " << max_err;
+        std::cerr << ", number of errors: " << err_count;
+        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
    }
    return res;
 }

--- a/library/include/ck/library/utility/conv_common.hpp
+++ b/library/include/ck/library/utility/conv_common.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
+++ b/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/library/include/ck/library/utility/convolution_parameter.hpp
+++ b/library/include/ck/library/utility/convolution_parameter.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/library/include/ck/library/utility/device_memory.hpp
+++ b/library/include/ck/library/utility/device_memory.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -20,12 +20,15 @@ __global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
 */
 struct DeviceMem
 {
-    DeviceMem() = delete;
+    DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}
    DeviceMem(std::size_t mem_size);
+    void Realloc(std::size_t mem_size);
    void* GetDeviceBuffer() const;
    std::size_t GetBufferSize() const;
    void ToDevice(const void* p) const;
+    void ToDevice(const void* p, const std::size_t cpySize) const;
    void FromDevice(void* p) const;
+    void FromDevice(void* p, const std::size_t cpySize) const;
    void SetZero() const;
    template <typename T>
    void SetValue(T x) const;

--- a/library/include/ck/library/utility/fill.hpp
+++ b/library/include/ck/library/utility/fill.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -102,9 +102,10 @@ struct FillMonotonicSeq
    }

    template <typename ForwardRange>
-    auto operator()(ForwardRange&& range) const -> std::void_t<decltype(
-        std::declval<const FillMonotonicSeq&>()(std::begin(std::forward<ForwardRange>(range)),
-                                                std::end(std::forward<ForwardRange>(range))))>
+    auto operator()(ForwardRange&& range) const
+        -> std::void_t<decltype(std::declval<const FillMonotonicSeq&>()(
+            std::begin(std::forward<ForwardRange>(range)),
+            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));

--- a/library/include/ck/library/utility/host_common_util.hpp
+++ b/library/include/ck/library/utility/host_common_util.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/library/include/ck/library/utility/host_gemm.hpp
+++ b/library/include/ck/library/utility/host_gemm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/library/include/ck/library/utility/host_tensor.hpp
+++ b/library/include/ck/library/utility/host_tensor.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -13,6 +13,7 @@

 #include "ck/utility/data_type.hpp"
 #include "ck/utility/span.hpp"
+#include "ck/utility/type_convert.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/ranges.hpp"

--- a/library/include/ck/library/utility/host_tensor_generator.hpp
+++ b/library/include/ck/library/utility/host_tensor_generator.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -95,6 +95,22 @@ struct GeneratorTensor_2<int8_t>
    }
 };

+#if defined CK_ENABLE_FP8 || defined CK_ENABLE_BF8
+template <>
+struct GeneratorTensor_2<ck::f8_t>
+{
+    int min_value = 0;
+    int max_value = 1;
+
+    template <typename... Is>
+    ck::f8_t operator()(Is...)
+    {
+        float tmp = (std::rand() % (max_value - min_value)) + min_value;
+        return ck::type_convert<ck::f8_t>(tmp);
+    }
+};
+#endif
+
 template <typename T>
 struct GeneratorTensor_3
 {
@@ -127,6 +143,25 @@ struct GeneratorTensor_3<ck::bhalf_t>
    }
 };

+#if defined CK_ENABLE_FP8 || defined CK_ENABLE_BF8
+template <>
+struct GeneratorTensor_3<ck::f8_t>
+{
+    float min_value = 0;
+    float max_value = 1;
+
+    template <typename... Is>
+    ck::f8_t operator()(Is...)
+    {
+        float tmp = float(std::rand()) / float(RAND_MAX);
+
+        float fp32_tmp = min_value + tmp * (max_value - min_value);
+
+        return ck::type_convert<ck::f8_t>(fp32_tmp);
+    }
+};
+#endif
+
 template <typename T>
 struct GeneratorTensor_4
 {

--- a/library/include/ck/library/utility/iterator.hpp
+++ b/library/include/ck/library/utility/iterator.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/library/include/ck/library/utility/literals.hpp
+++ b/library/include/ck/library/utility/literals.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/library/include/ck/library/utility/numeric.hpp
+++ b/library/include/ck/library/utility/numeric.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/library/include/ck/library/utility/ranges.hpp
+++ b/library/include/ck/library/utility/ranges.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -12,9 +12,46 @@ set(CK_DEVICE_INSTANCES)
 FOREACH(subdir_path ${dir_list})
 set(target_dir)
 IF(IS_DIRECTORY "${subdir_path}")
-    get_filename_component(target_dir ${subdir_path} NAME)
-    add_subdirectory(${target_dir})
-    list(APPEND CK_DEVICE_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+    set(cmake_instance)
+    file(READ "${subdir_path}/CMakeLists.txt" cmake_instance)
+    set(add_inst 0)
+    if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp8\" " AND DTYPES MATCHES "fp8")
+            #message("fp8 instance found!")
+            set(add_inst 1)
+    endif()
+    if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp16\"" AND DTYPES MATCHES "fp16")
+            #message("fp16 instance found!")
+            set(add_inst 1)
+    endif()
+    if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp32\"" AND DTYPES MATCHES "fp32")
+            #message("fp32 instance found!")
+            set(add_inst 1)
+    endif()
+    if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp64\"" AND DTYPES MATCHES "fp64")
+            #message("fp64 instance found!")
+            set(add_inst 1)
+    endif()
+    if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf16\"" AND DTYPES MATCHES "bf16")
+            #message("bf16 instance found!")
+            set(add_inst 1)
+    endif()
+    if("${cmake_instance}" MATCHES "DTYPES MATCHES \"int8\"" AND DTYPES MATCHES "int8")
+            #message("int8 instance found!")
+            set(add_inst 1)
+    endif()
+    if(NOT "${cmake_instance}" MATCHES "DTYPES" OR NOT DEFINED DTYPES)
+            #message("instance should be built for all types!")
+            set(add_inst 1)
+    endif()
+    if("${cmake_instance}" MATCHES "ONLY DL_KERNELS" AND NOT DEFINED DL_KERNELS)
+	    message("Found only dl instances, but DL_KERNELS is not set. Skipping.")
+            set(add_inst 0)
+    endif()
+    if(add_inst EQUAL 1)
+      get_filename_component(target_dir ${subdir_path} NAME)
+      add_subdirectory(${target_dir})
+      list(APPEND CK_DEVICE_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+    endif()
 ENDIF()
 ENDFOREACH()


--- a/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt
+set(DEVICE_AVGPOOL_BWD_INSTANCES)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_AVGPOOL_BWD_INSTANCES device_avg_pool3d_bwd_ndhwc_f16_instance.cpp)
+endif()
+if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_AVGPOOL_BWD_INSTANCES device_avg_pool3d_bwd_ndhwc_bf16_instance.cpp)
+endif()
+if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_AVGPOOL_BWD_INSTANCES device_avg_pool3d_bwd_ndhwc_f32_instance.cpp)
+endif()
+add_instance_library(device_avg_pool3d_bwd_instance ${DEVICE_AVGPOOL_BWD_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/avg_pool3d_bwd_ndhwc_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/avg_pool3d_bwd_ndhwc_instance_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I32   = int32_t;
+using F16   = ck::half_t;
+using BF16  = ck::bhalf_t;
+using F32   = float;
+using NDHWC = ck::tensor_layout::convolution::NDHWC;
+
+using device_avgpool_bwd_ndhwc_f16_instances =
+    // clang-format off
+    std::tuple <
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<F16, F16, F32, 256, 256, 1, 1, 1, 1>,
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<F16, F16, F32, 256, 256, 1, 2, 2, 2>,
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<F16, F16, F32, 256, 256, 1, 4, 4, 4>,
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<F16, F16, F32, 256, 256, 1, 8, 8, 8>,
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<F16, F16, F32, 256, 32, 8, 8, 8, 8>
+               // clang-format on
+               >;
+
+using device_avgpool_bwd_ndhwc_bf16_instances =
+    // clang-format off
+    std::tuple <
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<BF16, BF16, F32, 256, 256, 1, 1, 1, 1>,
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<BF16, BF16, F32, 256, 256, 1, 2, 2, 2>,
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<BF16, BF16, F32, 256, 256, 1, 4, 4, 4>,
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<BF16, BF16, F32, 256, 256, 1, 8, 8, 8>,
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<BF16, BF16, F32, 256, 32, 8, 8, 8, 8>
+               // clang-format on
+               >;
+
+using device_avgpool_bwd_ndhwc_f32_instances =
+    // clang-format off
+    std::tuple <
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<F32, F32, F32, 256, 256, 1, 1, 1, 1>,
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<F32, F32, F32, 256, 256, 1, 2, 2, 2>,
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<F32, F32, F32, 256, 256, 1, 4, 4, 4>,
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<F32, F32, F32, 256, 256, 1, 8, 8, 8>,
+        DeviceAvgPool3dBwd_NDHWC_NDHWC<F32, F32, F32, 256, 32, 8, 8, 8, 8>
+               // clang-format on
+               >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck