Unverified commit b73ae242 authored by Rostyslav Geyyer, committed by GitHub

Add int4 example for convnd_fwd_bias_relu_add (#375)

* Add int4 example for convnd_fwd_bias_relu_add

* Fix AddReluAdd for building without int4 support

* Update CMakeLists.txt

* Format

* Convert int4 tensors for int8 kernel

* Fix device memory allocation

* Format

* Format
parent d520d0cf
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 PRIVATE utility)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 PRIVATE utility)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 PRIVATE utility)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 PRIVATE utility)
\ No newline at end of file
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int4 grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp)
endif() # USE_BITINT_EXTENSION_INT4
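Note that the int4 target only exists when the build is configured with USE_BITINT_EXTENSION_INT4; on the source side the same switch appears as the CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 macro that guards the int4-only code further down in this diff. A minimal sketch of that guard pattern (the include path and the fallback alias are illustrative assumptions, not part of this commit):

#include "ck/utility/data_type.hpp" // assumed location of ck::int4_t

#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
using SmallIntT = ck::int4_t; // defined only when the bit-int extension is enabled
#else
using SmallIntT = int8_t;     // hypothetical fallback so code naming SmallIntT still builds
#endif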
@@ -26,13 +26,16 @@ void print_helper_msg()
}
template <ck::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename InKernelDataType,
typename WeiKernelDataType,
typename CShuffleDataType,
typename OutDataType,
typename OutKernelDataType,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp,
typename InUserDataType,
typename WeiUserDataType,
typename OutUserDataType,
typename DeviceConvNDFwdInstance>
int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
int init_method,
@@ -47,12 +50,12 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
{
Tensor<InDataType> in(in_g_n_c_wis_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<OutDataType> bias(bias_g_n_k_wos_desc);
Tensor<OutDataType> residual(residual_g_n_k_wos_desc);
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
Tensor<InUserDataType> in(in_g_n_c_wis_desc);
Tensor<WeiUserDataType> wei(wei_g_k_c_xs_desc);
Tensor<OutUserDataType> bias(bias_g_n_k_wos_desc);
Tensor<OutUserDataType> residual(residual_g_n_k_wos_desc);
Tensor<OutUserDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutKernelDataType> out_device(out_g_n_k_wos_desc);
std::cout << "in: " << in.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
@@ -64,26 +67,38 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
{
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
in.GenerateTensorValue(GeneratorTensor_2<InUserDataType>{-5, 5});
wei.GenerateTensorValue(GeneratorTensor_2<WeiUserDataType>{-5, 5});
bias.GenerateTensorValue(GeneratorTensor_2<OutUserDataType>{-5, 5});
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
in.GenerateTensorValue(GeneratorTensor_3<InUserDataType>{0.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiUserDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<OutUserDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpaceSize());
DeviceMem residual_device_buf(sizeof(OutDataType) * residual.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
DeviceMem in_device_buf(sizeof(InKernelDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiKernelDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(OutKernelDataType) * bias.mDesc.GetElementSpaceSize());
DeviceMem residual_device_buf(sizeof(OutKernelDataType) * residual.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize());
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
const Tensor<InKernelDataType> in_converted(in);
const Tensor<WeiKernelDataType> wei_converted(wei);
const Tensor<OutKernelDataType> bias_converted(bias);
const Tensor<OutKernelDataType> residual_converted(residual);
in_device_buf.ToDevice(in_converted.mData.data());
wei_device_buf.ToDevice(wei_converted.mData.data());
bias_device_buf.ToDevice(bias_converted.mData.data());
residual_device_buf.ToDevice(residual_converted.mData.data());
#else // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
bias_device_buf.ToDevice(bias.mData.data());
residual_device_buf.ToDevice(residual.mData.data());
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
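The #ifdef block above is the core of the int4 path: host tensors stay in the user data type, and only when the bit-int extension is enabled are they run through Tensor's converting copy constructor to widen them to the kernel data type before ToDevice (the matching narrowing happens after FromDevice in the verification code further down). A self-contained plain-C++ illustration of the same round trip, with ck::int4_t emulated by an int8_t restricted to [-8, 7] so the sketch builds without CK (all names here are mine):

#include <cassert>
#include <cstdint>
#include <vector>

using Int4Emulated = int8_t; // stand-in for ck::int4_t (values kept in [-8, 7])
using KernelInt    = int8_t; // the type the device kernel actually computes on

int main()
{
    std::vector<Int4Emulated> user{-5, 3, 7, -8};                 // user-side tensor
    std::vector<KernelInt> kernel(user.begin(), user.end());      // widen before ToDevice
    // ... the device op consumes and produces KernelInt buffers here ...
    std::vector<Int4Emulated> back(kernel.begin(), kernel.end()); // narrow before check_err
    assert(back == user);
    return 0;
}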
@@ -154,7 +169,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
std::size_t num_btype = conv_param.GetByte<InUserDataType, WeiUserDataType, OutUserDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
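The unit arithmetic here is easy to misread: assuming invoker.Run reports avg_time in milliseconds (as CK's performance output elsewhere suggests), dividing the FLOP count by 1.E9 and by milliseconds yields TFLOP/s, and dividing the byte count by 1.E6 and by milliseconds yields GB/s. A small compile-time check of that reasoning (a sketch, independent of the example):

// Units sanity check: 2 TFLOP of work finished in one second (1000 ms).
constexpr double flop   = 2.0e12;
constexpr double avg_ms = 1000.0;
constexpr double tflops = flop / 1.0e9 / avg_ms; // GFLOP per millisecond == TFLOP per second
static_assert(tflops == 2.0, "flop / 1e9 / ms is TFLOP per second");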
@@ -168,8 +183,8 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
InUserDataType,
WeiUserDataType,
CShuffleDataType,
InElementOp,
WeiElementOp,
@@ -196,10 +211,22 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
out_device_buf.FromDevice(out_device.mData.data());
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
const Tensor<OutUserDataType> out_device_converted(out_device);
return ck::utils::check_err(out_device_converted.mData,
out_host.mData,
"Error: incorrect results!",
1e-5f,
1e-4f)
? 0
: 1;
#else // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
return ck::utils::check_err(
out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f)
? 0
: 1;
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
}
return 0;
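On the verification side the device output comes back in the kernel data type and, with the extension enabled, is narrowed to the user data type before the comparison, so both operands of check_err carry the same precision. For reference, a hypothetical stand-alone use of ck::utils::check_err with the same tolerances and the same 0-on-success convention (header path assumed):

#include <vector>
#include "ck/library/utility/check_err.hpp" // assumed header for ck::utils::check_err

int main()
{
    std::vector<float> device_result{1.0f, 2.0f};
    std::vector<float> host_reference{1.0f, 2.0f};
    bool ok = ck::utils::check_err(
        device_result, host_reference, "Error: incorrect results!", 1e-5f, 1e-4f);
    return ok ? 0 : 1;
}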
......
@@ -7,13 +7,19 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::bhalf_t;
using WeiDataType = ck::bhalf_t;
using AccDataType = float;
using CShuffleDataType = float;
using BiasDataType = ck::bhalf_t;
using ResidualDataType = ck::bhalf_t;
using OutDataType = ck::bhalf_t;
// kernel data types
using InKernelDataType = ck::bhalf_t;
using WeiKernelDataType = ck::bhalf_t;
using AccDataType = float;
using CShuffleDataType = float;
using BiasKernelDataType = ck::bhalf_t;
using ResidualKernelDataType = ck::bhalf_t;
using OutKernelDataType = ck::bhalf_t;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance =
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType, ResidualDataType>,
OutDataType,
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
@@ -181,13 +187,16 @@ int main(int argc, char* argv[])
});
return run_grouped_conv_fwd_bias_relu_add<1,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
@@ -290,13 +299,16 @@ int main(int argc, char* argv[])
});
return run_grouped_conv_fwd_bias_relu_add<2,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
@@ -413,13 +425,16 @@ int main(int argc, char* argv[])
});
return run_grouped_conv_fwd_bias_relu_add<3,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
......
@@ -7,13 +7,19 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using BiasDataType = ck::half_t;
using ResidualDataType = ck::half_t;
using OutDataType = ck::half_t;
// kernel data types
using InKernelDataType = ck::half_t;
using WeiKernelDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using BiasKernelDataType = ck::half_t;
using ResidualKernelDataType = ck::half_t;
using OutKernelDataType = ck::half_t;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance =
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType, ResidualDataType>,
OutDataType,
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
@@ -181,13 +187,16 @@ int main(int argc, char* argv[])
});
return run_grouped_conv_fwd_bias_relu_add<1,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
@@ -290,13 +299,16 @@ int main(int argc, char* argv[])
});
return run_grouped_conv_fwd_bias_relu_add<2,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
@@ -413,13 +425,16 @@ int main(int argc, char* argv[])
});
return run_grouped_conv_fwd_bias_relu_add<3,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
......
@@ -7,13 +7,19 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = float;
using WeiDataType = float;
using AccDataType = float;
using CShuffleDataType = float;
using BiasDataType = float;
using ResidualDataType = float;
using OutDataType = float;
// kernel data types
using InKernelDataType = float;
using WeiKernelDataType = float;
using AccDataType = float;
using CShuffleDataType = float;
using BiasKernelDataType = float;
using ResidualKernelDataType = float;
using OutKernelDataType = float;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance =
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType, ResidualDataType>,
OutDataType,
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
@@ -181,13 +187,16 @@ int main(int argc, char* argv[])
});
return run_grouped_conv_fwd_bias_relu_add<1,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
@@ -290,13 +299,16 @@ int main(int argc, char* argv[])
});
return run_grouped_conv_fwd_bias_relu_add<2,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
@@ -413,13 +425,16 @@ int main(int argc, char* argv[])
});
return run_grouped_conv_fwd_bias_relu_add<3,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
......
@@ -7,13 +7,19 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = int8_t;
using WeiDataType = int8_t;
using AccDataType = int32_t;
using CShuffleDataType = int8_t;
using BiasDataType = int8_t;
using ResidualDataType = int8_t;
using OutDataType = int8_t;
// kernel data types
using InKernelDataType = int8_t;
using WeiKernelDataType = int8_t;
using AccDataType = int32_t;
using CShuffleDataType = int8_t;
using BiasKernelDataType = int8_t;
using ResidualKernelDataType = int8_t;
using OutKernelDataType = int8_t;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
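The int8 aliases above are the direct template for the new int4 example, whose source (grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp) is not expanded in this diff. Following the commit message "Convert int4 tensors for int8 kernel", the int4 variant presumably keeps int8 on the kernel side for the convolution inputs and moves the user-facing tensors to ck::int4_t. A hedged sketch of that split; the remaining kernel-side aliases are not shown in this diff and are left out rather than guessed:

// Presumed (not copied) alias sketch for the int4 example
// kernel data types: what the device op computes on
using InKernelDataType  = int8_t;
using WeiKernelDataType = int8_t;
using AccDataType       = int32_t;
// tensor (user) data types: what the host Tensor<> objects store
using InUserDataType  = ck::int4_t;
using WeiUserDataType = ck::int4_t;
using OutUserDataType = ck::int4_t;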
@@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance =
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType, ResidualDataType>,
OutDataType,
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
@@ -181,13 +187,16 @@ int main(int argc, char* argv[])
});
return run_grouped_conv_fwd_bias_relu_add<1,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
@@ -290,13 +299,16 @@ int main(int argc, char* argv[])
});
return run_grouped_conv_fwd_bias_relu_add<2,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
@@ -413,13 +425,16 @@ int main(int argc, char* argv[])
});
return run_grouped_conv_fwd_bias_relu_add<3,
InDataType,
WeiDataType,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
......
@@ -98,6 +98,18 @@ struct AddReluAdd
int32_t c = b + x2;
y = c;
}
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
template <>
__host__ __device__ constexpr void operator()<int4_t, int8_t, int4_t, int4_t>(
int4_t& y, const int8_t& x0, const int4_t& x1, const int4_t& x2) const
{
int32_t a = x0 + x1;
int32_t b = a > 0 ? a : 0;
int32_t c = b + x2;
y = c;
}
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
};
struct AddHardswishAdd
......
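The new int4 overload of AddReluAdd above is what "Fix AddReluAdd for building without int4 support" refers to: it is fenced behind CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 so translation units compiled without the extension never see ck::int4_t, and its arithmetic matches the int8 overload (add, clamp at zero, add the residual, all in int32_t before narrowing). A standalone check of that arithmetic; the template below mirrors the body shown in the diff rather than including the CK header:

#include <cassert>
#include <cstdint>

// Mirrors AddReluAdd: y = max(x0 + x1, 0) + x2, computed in int32_t and
// narrowed to the output type only at the end.
template <typename Y, typename X0, typename X1, typename X2>
constexpr void add_relu_add(Y& y, X0 x0, X1 x1, X2 x2)
{
    int32_t a = x0 + x1;
    int32_t b = a > 0 ? a : 0;
    int32_t c = b + x2;
    y         = static_cast<Y>(c);
}

int main()
{
    int8_t y = 0;
    add_relu_add(y, int8_t{-3}, int8_t{2}, int8_t{5}); // relu(-3 + 2) + 5 = 5
    assert(y == 5);
    add_relu_add(y, int8_t{4}, int8_t{2}, int8_t{-1}); // relu(4 + 2) - 1 = 5
    assert(y == 5);
    return 0;
}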