Merge branch 'develop' into lwpck-537

7d3c5ea4 · zjing14 · GitHub · 981b8549 · 3b18f1e3 · 7d3c5ea4
Unverified Commit 7d3c5ea4 authored Jun 25, 2023 by zjing14 Committed by GitHub Jun 25, 2023
20 changed files
--- a/example/39_permute/common.hpp
+++ b/example/39_permute/common.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/example/39_permute/permute_1xHxW_fp16.cpp
+++ b/example/39_permute/permute_1xHxW_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"

--- a/example/39_permute/permute_HxWx4_fp16.cpp
+++ b/example/39_permute/permute_HxWx4_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"

--- a/example/39_permute/permute_NxHxW_fp16.cpp
+++ b/example/39_permute/permute_NxHxW_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"

--- a/example/39_permute/run_permute_bundle_example.inc
+++ b/example/39_permute/run_permute_bundle_example.inc
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/example/39_permute/run_permute_element_example.inc
+++ b/example/39_permute/run_permute_element_example.inc
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/example/40_conv2d_fwd_quantization/CMakeLists.txt
+++ b/example/40_conv2d_fwd_quantization/CMakeLists.txt
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
+   add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
+   add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_perchannel_quantization_int8.cpp)
+   add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
+   add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
+   set(target 1)
+ endif()
+endforeach()
 # Conv perlayer quantization
 add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
 # Conv perchannel quantization
 add_example_executable(example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_perchannel_quantization_int8.cpp)
 # Conv + bias + relu perlayer quantization
 add_example_executable(example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
 # Conv + bias + relu perchannel quantization
 add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
+# Conv + bias + tanh perlayer quantization
+add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
+# Conv + bias + tanh perchannel quantization
+add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
\ No newline at end of file
--- a/example/40_conv2d_fwd_quantization/common.hpp
+++ b/example/40_conv2d_fwd_quantization/common.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
@@ -76,6 +76,10 @@ using DeviceGroupedConvNDFwdInstance =
        5,                   // CThreadTransferSrcDstVectorDim
        4>;                  // CThreadTransferDstScalarPerVector
-#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
-int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+};
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
@@ -74,6 +74,11 @@ using DeviceGroupedConvNDFwdInstance =
        5,                   // CThreadTransferSrcDstVectorDim
        4>;                  // CThreadTransferDstScalarPerVector
-#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
-int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale       = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+}
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
+using InDataType           = int8_t;
+using WeiDataType          = int8_t;
+using BiasDataType         = int32_t;
+using RequantScaleDataType = float;
+using AccDataType          = int32_t;
+using OutDataType          = int8_t;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::TanH;
+using OutElementOp =
+    ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename BiasLayout,
+          typename RequantScaleLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
+        NDimSpatial,
+        InDataType,
+        WeiDataType,
+        ck::Tuple<BiasDataType, RequantScaleDataType>,
+        OutDataType,
+        AccDataType,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout, RequantScaleLayout>,
+        OutLayout,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,            // ConvForwardSpecialization
+        GemmSpec,            // GemmSpecialization
+        256,                 // BlockSize
+        128,                 // MPerBlock
+        128,                 // NPerBlock
+        16,                  // K0PerBlock
+        4,                   // K1
+        4,                   // M1PerThread
+        4,                   // N1PerThread
+        1,                   // KPerThread
+        S<8, 2>,             // M1N1ThreadClusterM1Xs
+        S<8, 2>,             // M1N1ThreadClusterN1Xs
+        S<8, 1, 1, 4>,       // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
+        S<2, 1, 128, 1>,     // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
+        S<1, 2, 0, 3>,       // ABlockTransferThreadClusterArrangeOrder
+        S<1, 2, 0, 3>,       // ABlockTransferSrcAccessOrder
+        S<4, 1, 1, 4>,       // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
+        S<1, 2, 0, 3>,       // ABlockTransferSrcVectorTensorContiguousDimOrder
+        S<1, 1, 1, 4>,       // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
+        S<8, 1, 1, 4>,       // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
+        S<2, 1, 128, 1>,     // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
+        S<1, 2, 0, 3>,       // BBlockTransferThreadClusterArrangeOrder
+        S<1, 2, 0, 3>,       // BBlockTransferSrcAccessOrder
+        S<4, 1, 1, 4>,       // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
+        S<1, 2, 0, 3>,       // BBlockTransferSrcVectorTensorContiguousDimOrder
+        S<1, 1, 1, 4>,       // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
+        S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
+        5,                   // CThreadTransferSrcDstVectorDim
+        4>;                  // CThreadTransferDstScalarPerVector
+#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
+int main()
+{
+    float scale_z_inv         = 0.5f;
+    const auto out_element_op = OutElementOp{scale_z_inv, ActivationOp{}};
+    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+};
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
+using InDataType   = int8_t;
+using WeiDataType  = int8_t;
+using BiasDataType = int32_t;
+using AccDataType  = int32_t;
+using OutDataType  = int8_t;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::TanH;
+using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename BiasLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
+        NDimSpatial,
+        InDataType,
+        WeiDataType,
+        ck::Tuple<BiasDataType>,
+        OutDataType,
+        AccDataType,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout>,
+        OutLayout,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,            // ConvForwardSpecialization
+        GemmSpec,            // GemmSpecialization
+        256,                 // BlockSize
+        128,                 // MPerBlock
+        128,                 // NPerBlock
+        16,                  // K0PerBlock
+        4,                   // K1
+        4,                   // M1PerThread
+        4,                   // N1PerThread
+        1,                   // KPerThread
+        S<8, 2>,             // M1N1ThreadClusterM1Xs
+        S<8, 2>,             // M1N1ThreadClusterN1Xs
+        S<8, 1, 1, 4>,       // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
+        S<2, 1, 128, 1>,     // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
+        S<1, 2, 0, 3>,       // ABlockTransferThreadClusterArrangeOrder
+        S<1, 2, 0, 3>,       // ABlockTransferSrcAccessOrder
+        S<4, 1, 1, 4>,       // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
+        S<1, 2, 0, 3>,       // ABlockTransferSrcVectorTensorContiguousDimOrder
+        S<1, 1, 1, 4>,       // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
+        S<8, 1, 1, 4>,       // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
+        S<2, 1, 128, 1>,     // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
+        S<1, 2, 0, 3>,       // BBlockTransferThreadClusterArrangeOrder
+        S<1, 2, 0, 3>,       // BBlockTransferSrcAccessOrder
+        S<4, 1, 1, 4>,       // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
+        S<1, 2, 0, 3>,       // BBlockTransferSrcVectorTensorContiguousDimOrder
+        S<1, 1, 1, 4>,       // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
+        S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
+        5,                   // CThreadTransferSrcDstVectorDim
+        4>;                  // CThreadTransferDstScalarPerVector
+#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
+int main()
+{
+    float scale_acc           = 0.5f;
+    float scale_z_inv         = 0.5f;
+    const auto out_element_op = OutElementOp{scale_z_inv, scale_acc, ActivationOp{}};
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+}
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
@@ -76,4 +76,8 @@ using DeviceGroupedConvNDFwdInstance =
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"
-int main() { run_conv2d_fwd_perchannel_quantization_example(); }
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+}
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
@@ -71,4 +71,9 @@ using DeviceGroupedConvNDFwdInstance =
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"
-int main() { run_conv2d_fwd_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale       = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+}
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
@@ -80,6 +80,10 @@ using DeviceGroupedConvNDFwdInstance =
        S<1, 64, 1, 4>,
        8>;
-#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
-int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+};
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
@@ -78,6 +78,11 @@ using DeviceGroupedConvNDFwdInstance =
        S<1, 64, 1, 4>,
        8>;
-#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
-int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale       = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+}
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
@@ -80,4 +80,8 @@ using DeviceGroupedConvNDFwdInstance =
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"
-int main() { run_conv2d_fwd_perchannel_quantization_example(); }
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+}
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
@@ -75,4 +75,9 @@ using DeviceGroupedConvNDFwdInstance =
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"
-int main() { run_conv2d_fwd_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale       = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+}
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 template <ck::index_t NDimSpatial,
@@ -167,7 +167,7 @@ bool run_grouped_conv_fwd(bool do_verification,
    return (pass ? 0 : 1);
 }
-int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
+int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op)
 {
    bool do_verification           = true;
    bool time_kernel               = true;
@@ -189,13 +189,12 @@ int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{ActivationOp{}};
-    using InLayout           = ck::tensor_layout::convolution::GNHWC;
+    using InLayout           = ck::tensor_layout::convolution::NHWGC;
-    using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
+    using WeiLayout          = ck::tensor_layout::convolution::KYXGC;
    using BiasLayout         = ck::tensor_layout::convolution::G_K;
    using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-    using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+    using OutLayout          = ck::tensor_layout::convolution::NHWGK;
    const auto in_g_n_c_wis_desc =
        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);

--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -155,7 +155,7 @@ bool run_grouped_conv_fwd(bool do_verification,
    return (pass ? 0 : 1);
 }
-int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
+int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op)
 {
    bool do_verification           = true;
    bool time_kernel               = true;
@@ -177,12 +177,11 @@ int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
-    using InLayout   = ck::tensor_layout::convolution::GNHWC;
+    using InLayout   = ck::tensor_layout::convolution::NHWGC;
-    using WeiLayout  = ck::tensor_layout::convolution::GKYXC;
+    using WeiLayout  = ck::tensor_layout::convolution::KYXGC;
    using BiasLayout = ck::tensor_layout::convolution::G_K;
-    using OutLayout  = ck::tensor_layout::convolution::GNHWK;
+    using OutLayout  = ck::tensor_layout::convolution::NHWGK;
    const auto in_g_n_c_wis_desc =
        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);