Merge branch 'develop' of github.com:ROCmSoftwarePlatform/composable_kernel...

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/composable_kernel into barkocot/fix-Filter1x1Pad0-check

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/composable_kernel...
Merge branch 'develop' of github.com:ROCmSoftwarePlatform/composable_kernel into barkocot/fix-Filter1x1Pad0-check
5641b889 · Bartlomiej Kocot · 4ecef37f · f2398f61 · 5641b889 · 5641b889
Commit 5641b889 authored Nov 14, 2023 by Bartlomiej Kocot
20 changed files
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -10,18 +10,18 @@ namespace device {
 namespace instance {
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                              NDHWGC,
+                                                                NDHWGC,
-                                                              GKZYXC,
+                                                                GKZYXC,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              NDHWGK,
+                                                                NDHWGK,
-                                                              BF16,
+                                                                BF16,
-                                                              BF16,
+                                                                BF16,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              BF16,
+                                                                BF16,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough>>>& instances)
+                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv_fwd_xdl_bf16_instances<3,

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp
@@ -10,19 +10,19 @@ namespace device {
 namespace instance {
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_f8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                              NDHWGC,
+                                                                NDHWGC,
-                                                              GKZYXC,
+                                                                GKZYXC,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              NDHWGK,
+                                                                NDHWGK,
-                                                              F16,
+                                                                F16,
-                                                              F16,
+                                                                F16,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              F16,
+                                                                F16,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              F8>>>& instances)
+                                                                F8>>>& instances)
 {
    add_device_operation_instances(
        instances,

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -10,18 +10,18 @@ namespace device {
 namespace instance {
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                              NDHWGC,
+                                                                NDHWGC,
-                                                              GKZYXC,
+                                                                GKZYXC,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              NDHWGK,
+                                                                NDHWGK,
-                                                              F16,
+                                                                F16,
-                                                              F16,
+                                                                F16,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              F16,
+                                                                F16,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough>>>& instances)
+                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv_fwd_xdl_f16_instances<3,

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -10,18 +10,18 @@ namespace device {
 namespace instance {
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                              NDHWGC,
+                                                                NDHWGC,
-                                                              GKZYXC,
+                                                                GKZYXC,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              NDHWGK,
+                                                                NDHWGK,
-                                                              F32,
+                                                                F32,
-                                                              F32,
+                                                                F32,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              F32,
+                                                                F32,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough>>>& instances)
+                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv_fwd_xdl_f32_instances<3,

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
@@ -9,18 +9,18 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                              NDHWGC,
+                                                                NDHWGC,
-                                                              GKZYXC,
+                                                                GKZYXC,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              NDHWGK,
+                                                                NDHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough>>>& instances)
+                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv_fwd_xdl_int8_instances<3,

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_ab/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_ab/CMakeLists.txt
+set(GROUPED_CONV3D_FWD_SCALEADD_AB # Source list for the instance library declared below: one translation unit per data type (bf16, f16, f32, int8).
+   xdl/device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp)
+add_instance_library(device_grouped_conv3d_fwd_scaleadd_ab_instance ${GROUPED_CONV3D_FWD_SCALEADD_AB}) # add_instance_library is a project-defined command (not visible here); it receives the target name and the source list above.
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_ab/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_ab/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_ab_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_bf16_instances( // Hands `instances` to add_device_operation_instances three times, once per convolution-specialization tag used in the body.
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                ck::Tuple<BF16, BF16>, // NOTE(review): presumably the A-side data types - confirm against DeviceGroupedConvFwdMultipleABD
+                                                                ck::Tuple<BF16, BF16>, // NOTE(review): presumably the B-side data types - confirm
+                                                                ck::Tuple<>,
+                                                                BF16,
+                                                                ScaleAdd,
+                                                                ScaleAdd,
+                                                                PassThrough>>>& instances) // taken by non-const reference so the calls below can modify the caller's vector
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_bf16_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               NDHWGK,
+                                                               ConvFwdDefault>{}); // 1st call: ConvFwdDefault tag
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_bf16_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               NDHWGK,
+                                                               ConvFwd1x1P0>{}); // 2nd call: ConvFwd1x1P0 tag. NOTE(review): name suggests 1x1 filter with zero padding - confirm where the tag is defined
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_bf16_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               NDHWGK,
+                                                               ConvFwd1x1S1P0>{}); // 3rd call: ConvFwd1x1S1P0 tag. NOTE(review): name suggests 1x1 filter, stride 1, zero padding - confirm
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_ab/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_ab/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_ab_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_f16_instances( // Hands `instances` to add_device_operation_instances three times, once per convolution-specialization tag used in the body.
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                ck::Tuple<F16, F16>, // NOTE(review): presumably the A-side data types - confirm against DeviceGroupedConvFwdMultipleABD
+                                                                ck::Tuple<F16, F16>, // NOTE(review): presumably the B-side data types - confirm
+                                                                ck::Tuple<>,
+                                                                F16,
+                                                                ScaleAdd,
+                                                                ScaleAdd,
+                                                                PassThrough>>>& instances) // taken by non-const reference so the calls below can modify the caller's vector
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_f16_instances<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              NDHWGK,
+                                                              ConvFwdDefault>{}); // 1st call: ConvFwdDefault tag
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_f16_instances<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              NDHWGK,
+                                                              ConvFwd1x1P0>{}); // 2nd call: ConvFwd1x1P0 tag. NOTE(review): name suggests 1x1 filter with zero padding - confirm where the tag is defined
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_f16_instances<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              NDHWGK,
+                                                              ConvFwd1x1S1P0>{}); // 3rd call: ConvFwd1x1S1P0 tag. NOTE(review): name suggests 1x1 filter, stride 1, zero padding - confirm
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_ab/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_ab/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_ab_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_f32_instances( // Hands `instances` to add_device_operation_instances three times, once per convolution-specialization tag used in the body.
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                ck::Tuple<F32, F32>, // NOTE(review): presumably the A-side data types - confirm against DeviceGroupedConvFwdMultipleABD
+                                                                ck::Tuple<F32, F32>, // NOTE(review): presumably the B-side data types - confirm
+                                                                ck::Tuple<>,
+                                                                F32,
+                                                                ScaleAdd,
+                                                                ScaleAdd,
+                                                                PassThrough>>>& instances) // taken by non-const reference so the calls below can modify the caller's vector
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_f32_instances<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              NDHWGK,
+                                                              ConvFwdDefault>{}); // 1st call: ConvFwdDefault tag
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_f32_instances<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              NDHWGK,
+                                                              ConvFwd1x1P0>{}); // 2nd call: ConvFwd1x1P0 tag. NOTE(review): name suggests 1x1 filter with zero padding - confirm where the tag is defined
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_f32_instances<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              NDHWGK,
+                                                              ConvFwd1x1S1P0>{}); // 3rd call: ConvFwd1x1S1P0 tag. NOTE(review): name suggests 1x1 filter, stride 1, zero padding - confirm
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_ab/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_ab/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_ab_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_int8_instances( // Hands `instances` to add_device_operation_instances three times, once per convolution-specialization tag used in the body.
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                ck::Tuple<int8_t, int8_t>, // NOTE(review): presumably the A-side data types - confirm against DeviceGroupedConvFwdMultipleABD
+                                                                ck::Tuple<int8_t, int8_t>, // NOTE(review): presumably the B-side data types - confirm
+                                                                ck::Tuple<>,
+                                                                int8_t,
+                                                                ScaleAdd,
+                                                                ScaleAdd,
+                                                                PassThrough>>>& instances) // taken by non-const reference so the calls below can modify the caller's vector
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_int8_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               NDHWGK,
+                                                               ConvFwdDefault>{}); // 1st call: ConvFwdDefault tag
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_int8_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               NDHWGK,
+                                                               ConvFwd1x1P0>{}); // 2nd call: ConvFwd1x1P0 tag. NOTE(review): name suggests 1x1 filter with zero padding - confirm where the tag is defined
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_int8_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               NDHWGK,
+                                                               ConvFwd1x1S1P0>{}); // 3rd call: ConvFwd1x1S1P0 tag. NOTE(review): name suggests 1x1 filter, stride 1, zero padding - confirm
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -10,18 +10,18 @@ namespace device {
 namespace instance {
 void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                              NDHWGC,
+                                                                NDHWGC,
-                                                              GKZYXC,
+                                                                GKZYXC,
-                                                              ck::Tuple<NDHWGK, NDHWGK>,
+                                                                ck::Tuple<NDHWGK, NDHWGK>,
-                                                              NDHWGK,
+                                                                NDHWGK,
-                                                              BF16,
+                                                                BF16,
-                                                              BF16,
+                                                                BF16,
-                                                              ck::Tuple<BF16, BF16>,
+                                                                ck::Tuple<BF16, BF16>,
-                                                              BF16,
+                                                                BF16,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              ScaleAddScaleAddRelu>>>& instances)
+                                                                ScaleAddScaleAddRelu>>>& instances)
 {
    add_device_operation_instances(
        instances,

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -10,18 +10,18 @@ namespace device {
 namespace instance {
 void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                              NDHWGC,
+                                                                NDHWGC,
-                                                              GKZYXC,
+                                                                GKZYXC,
-                                                              ck::Tuple<NDHWGK, NDHWGK>,
+                                                                ck::Tuple<NDHWGK, NDHWGK>,
-                                                              NDHWGK,
+                                                                NDHWGK,
-                                                              F16,
+                                                                F16,
-                                                              F16,
+                                                                F16,
-                                                              ck::Tuple<half_t, half_t>,
+                                                                ck::Tuple<half_t, half_t>,
-                                                              F16,
+                                                                F16,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              ScaleAddScaleAddRelu>>>& instances)
+                                                                ScaleAddScaleAddRelu>>>& instances)
 {
    add_device_operation_instances(
        instances,

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -10,18 +10,18 @@ namespace device {
 namespace instance {
 void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                              NDHWGC,
+                                                                NDHWGC,
-                                                              GKZYXC,
+                                                                GKZYXC,
-                                                              ck::Tuple<NDHWGK, NDHWGK>,
+                                                                ck::Tuple<NDHWGK, NDHWGK>,
-                                                              NDHWGK,
+                                                                NDHWGK,
-                                                              F32,
+                                                                F32,
-                                                              F32,
+                                                                F32,
-                                                              ck::Tuple<F32, F32>,
+                                                                ck::Tuple<F32, F32>,
-                                                              F32,
+                                                                F32,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              ScaleAddScaleAddRelu>>>& instances)
+                                                                ScaleAddScaleAddRelu>>>& instances)
 {
    add_device_operation_instances(
        instances,

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
@@ -9,18 +9,18 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                              NDHWGC,
+                                                                NDHWGC,
-                                                              GKZYXC,
+                                                                GKZYXC,
-                                                              ck::Tuple<NDHWGK, NDHWGK>,
+                                                                ck::Tuple<NDHWGK, NDHWGK>,
-                                                              NDHWGK,
+                                                                NDHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              ck::Tuple<F32, F32>,
+                                                                ck::Tuple<F32, F32>,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              ScaleAddScaleAddRelu>>>& instances)
+                                                                ScaleAddScaleAddRelu>>>& instances)
 {
    add_device_operation_instances(
        instances,

--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
@@ -8,18 +8,18 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_GK_Tuple,
+                                                                GK_GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              I32_F32_Tuple,
+                                                                I32_F32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Add_Mul2_Clamp>>>& instances)
+                                                                Add_Mul2_Clamp>>>& instances)
 {
    // dl
    add_device_operation_instances(instances,
@@ -52,18 +52,18 @@ void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
 }
 void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_GK_Tuple,
+                                                                GK_GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              I32_F32_Tuple,
+                                                                I32_F32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Add_Relu_Mul2_Clamp>>>& instances)
+                                                                Add_Relu_Mul2_Clamp>>>& instances)
 {
    // dl
    add_device_operation_instances(instances,
@@ -96,18 +96,19 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
 }
 void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_GK_Tuple,
+                                                                GK_GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              I32_F32_Tuple,
+                                                                I32_F32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Add_Mul2_TanH_Mul_Clamp>>>& instances)
+                                                                Add_Mul2_TanH_Mul_Clamp>>>&
+        instances)
 {
    // dl
    add_device_operation_instances(instances,

--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
@@ -8,18 +8,18 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_Tuple,
+                                                                GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              I32_Tuple,
+                                                                I32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Add_Mul_Clamp>>>& instances)
+                                                                Add_Mul_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
 }
 void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_Tuple,
+                                                                GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              I32_Tuple,
+                                                                I32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Add_Relu_Mul_Clamp>>>& instances)
+                                                                Add_Relu_Mul_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
@@ -96,18 +96,19 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
 }
 void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_Tuple,
+                                                                GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              I32_Tuple,
+                                                                I32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Add_Mul_TanH_Mul_Clamp>>>& instances)
+                                                                Add_Mul_TanH_Mul_Clamp>>>&
+        instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_dl_int8_instances<NHWGC,

--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
@@ -8,18 +8,18 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 void add_device_conv2d_dl_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_Tuple,
+                                                                GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              F32_Tuple,
+                                                                F32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Mul2_Clamp>>>& instances)
+                                                                Mul2_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_dl_perchannel_quantization_int8_instances(
 }
 void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_Tuple,
+                                                                GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              F32_Tuple,
+                                                                F32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Relu_Mul2_Clamp>>>& instances)
+                                                                Relu_Mul2_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_dl_int8_instances<NHWGC,

--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
@@ -8,18 +8,18 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 void add_device_conv2d_dl_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Mul_Clamp>>>& instances)
+                                                                Mul_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_dl_perlayer_quantization_int8_instances(
 }
 void add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Relu_Mul_Clamp>>>& instances)
+                                                                Relu_Mul_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_dl_int8_instances<NHWGC,

--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
@@ -8,18 +8,18 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_GK_Tuple,
+                                                                GK_GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              I32_F32_Tuple,
+                                                                I32_F32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Add_Mul2_Clamp>>>& instances)
+                                                                Add_Mul2_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
 }
 void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_GK_Tuple,
+                                                                GK_GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              I32_F32_Tuple,
+                                                                I32_F32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Add_Relu_Mul2_Clamp>>>& instances)
+                                                                Add_Relu_Mul2_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
@@ -94,18 +94,19 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
 }
 void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_GK_Tuple,
+                                                                GK_GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              I32_F32_Tuple,
+                                                                I32_F32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Add_Mul2_TanH_Mul_Clamp>>>& instances)
+                                                                Add_Mul2_TanH_Mul_Clamp>>>&
+        instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,

--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
@@ -8,18 +8,18 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_Tuple,
+                                                                GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              I32_Tuple,
+                                                                I32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Add_Mul_Clamp>>>& instances)
+                                                                Add_Mul_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
 }
 void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_Tuple,
+                                                                GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              I32_Tuple,
+                                                                I32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Add_Relu_Mul_Clamp>>>& instances)
+                                                                Add_Relu_Mul_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
@@ -96,18 +96,19 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
 }
 void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_Tuple,
+                                                                GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              I32_Tuple,
+                                                                I32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Add_Mul_TanH_Mul_Clamp>>>& instances)
+                                                                Add_Mul_TanH_Mul_Clamp>>>&
+        instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,