Unverified Commit 4b70d68e authored by Chao Liu, committed by GitHub

Merge branch 'develop' into add_fp16_wmma_conv_instance

parents 212b9299 f82bd593
......@@ -25,16 +25,4 @@ struct float_equal_zero
};
};
template <index_t N>
static constexpr __device__ index_t get_shift()
{
return (get_shift<N / 2>() + 1);
};
template <>
constexpr __device__ index_t get_shift<1>()
{
return (0);
}
} // namespace ck
......@@ -62,24 +62,6 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, half_t>(half_
return type_convert<bhalf_t>(x_fp32);
}
// convert bfp16 to int32 via fp32
template <>
inline __host__ __device__ constexpr int32_t type_convert<int32_t, bhalf_t>(bhalf_t x)
{
float x_fp32 = type_convert<float>(x);
return static_cast<int32_t>(x_fp32);
}
// convert int32 to bfp16 via fp32
template <>
inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int32_t>(int32_t x)
{
float x_fp32 = static_cast<float>(x);
return type_convert<bhalf_t>(x_fp32);
}
// convert bfp16 to int8 via fp32
template <>
inline __host__ __device__ constexpr int8_t type_convert<int8_t, bhalf_t>(bhalf_t x)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/host_utility/hip_check_error.hpp"
namespace ck {
// Initialization flag of the barrier object; can be any value except zero
static constexpr int BarrierInitFlag = 0x7856;
// 1) Only the first thread-block in the synchronization group is supposed to call this function.
// It is the user's responsibility to ensure the two integer values in p_control_bits are zero
// before calling gms_init().
// 2) After calling gms_reset(), the two integer values in p_control_bits are zero again, so the
// p_control_bits buffer does not need to be re-initialized between uses.
static __device__ void gms_init(int NumWarps, int* p_control_bits)
{
union
{
int two32[2];
unsigned long one64;
} regs;
regs.two32[0] = BarrierInitFlag;
regs.two32[1] = NumWarps;
// pack {init flag, warp count} and publish them to global memory with a single 64-bit atomic CAS
if(threadIdx.x == 0)
atomicCAS(reinterpret_cast<unsigned long*>(p_control_bits), 0, regs.one64);
};
// All the workgroups in the synchronization group are supposed to call this function
static __device__ void gms_barrier(int* p_control_bits)
{
constexpr int mask = warpSize - 1;
if((threadIdx.x & mask) == 0)
{
// ensure the barrier object is initialized
do
{
const int r0 = __atomic_load_n(&p_control_bits[0], __ATOMIC_RELAXED);
if(r0 == BarrierInitFlag)
break;
} while(true);
// signal that this warp has arrived at the barrier
atomicSub(&p_control_bits[1], 1);
// wait until all warps have arrived
do
{
const int r1 = __atomic_load_n(&p_control_bits[1], __ATOMIC_RELAXED);
if(r1 == 0)
break;
} while(true);
};
};
// 1) Only the first thread-block in the synchronization group is supposed to call this function.
// 2) After calling gms_reset(), the two integer values in p_control_bits are zero again, so the
// p_control_bits buffer does not need to be re-initialized between uses.
static __device__ void gms_reset(int* p_control_bits)
{
// reset the barrier object
if(threadIdx.x == 0)
(void)atomicCAS(&p_control_bits[0], BarrierInitFlag, 0);
};
} // namespace ck
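
// A minimal usage sketch (not part of this commit): one possible calling sequence for
// gms_init()/gms_barrier()/gms_reset(). It assumes p_control_bits points to a zero-initialized
// int[2] buffer in global memory, the whole grid forms one synchronization group, blockDim.x is
// a multiple of warpSize, and the grid is one-dimensional. The kernel name and the phase
// placeholders are hypothetical; any __threadfence() needed to make phase-1 results visible to
// other workgroups is omitted here.
__global__ void gms_usage_example(int* p_control_bits)
{
    const int warps_per_block = blockDim.x / warpSize;
    const int total_warps     = warps_per_block * gridDim.x;

    // only the first workgroup publishes the barrier object
    if(blockIdx.x == 0)
        ck::gms_init(total_warps, p_control_bits);

    // ... phase-1 work done by every workgroup ...

    // every workgroup calls the barrier; internally only lane 0 of each warp spins
    ck::gms_barrier(p_control_bits);

    // ... phase-2 work that may depend on all workgroups having finished phase 1 ...

    // only the first workgroup clears the barrier so the buffer can be reused without
    // re-initialization
    if(blockIdx.x == 0)
        ck::gms_reset(p_control_bits);
}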
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
......@@ -39,7 +39,7 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(
std::vector<std::unique_ptr<
DeviceConvBwdData<1, NWC, KXC, NWK, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
instances);
#ifdef __int8__
void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(
std::vector<std::unique_ptr<DeviceConvBwdData<1,
NWC,
......@@ -51,7 +51,7 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(
PassThrough,
PassThrough,
PassThrough>>>& instances);
#endif
// conv2d backward data
void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
std::vector<std::unique_ptr<DeviceConvBwdData<2,
......@@ -88,7 +88,7 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
PassThrough,
PassThrough,
PassThrough>>>& instances);
#ifdef __int8__
void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
std::vector<std::unique_ptr<DeviceConvBwdData<2,
NHWC,
......@@ -100,7 +100,7 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
PassThrough,
PassThrough,
PassThrough>>>& instances);
#endif
// conv2d dl
void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances(
std::vector<std::unique_ptr<DeviceConvBwdData<2,
......@@ -125,7 +125,7 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(
PassThrough,
PassThrough,
PassThrough>>>& instances);
#ifdef __int8__
void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances(
std::vector<std::unique_ptr<DeviceConvBwdData<2,
NHWC,
......@@ -137,6 +137,7 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances(
PassThrough,
PassThrough,
PassThrough>>>& instances);
#endif
// conv3d backward data
void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
std::vector<std::unique_ptr<DeviceConvBwdData<3,
......@@ -173,7 +174,7 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
PassThrough,
PassThrough,
PassThrough>>>& instances);
#ifdef __int8__
void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(
std::vector<std::unique_ptr<DeviceConvBwdData<3,
NDHWC,
......@@ -185,7 +186,7 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(
PassThrough,
PassThrough,
PassThrough>>>& instances);
#endif
template <ck::index_t NumDimSpatial,
typename InLayout,
typename WeiLayout,
......@@ -239,11 +240,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
{
add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(op_ptrs);
}
#ifdef __int8__
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
is_same_v<OutDataType, int8_t>)
{
add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(op_ptrs);
}
#endif
}
else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWC> &&
is_same_v<WeiLayout, KYXC> && is_same_v<OutLayout, NHWK>)
......@@ -266,12 +269,14 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
{
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(op_ptrs);
}
#ifdef __int8__
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
is_same_v<OutDataType, int8_t>)
{
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(op_ptrs);
add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances(op_ptrs);
}
#endif
}
else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWC> &&
is_same_v<WeiLayout, KZYXC> && is_same_v<OutLayout, NDHWK>)
......@@ -292,11 +297,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
{
add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(op_ptrs);
}
#ifdef __int8__
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
is_same_v<OutDataType, int8_t>)
{
add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(op_ptrs);
}
#endif
}
return op_ptrs;
......
......@@ -77,7 +77,7 @@ void add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
instances);
#ifdef __int8__
void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
......@@ -118,6 +118,27 @@ void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances(
DeviceGemm<Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
instances);
#endif
void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
......@@ -183,26 +204,6 @@ void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(
DeviceGemm<Row, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
......@@ -388,6 +389,7 @@ struct DeviceOperationInstanceFactory<
add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances(op_ptrs);
}
}
#ifdef __int8__
else if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
is_same_v<CDataType, int8_t>)
{
......@@ -420,7 +422,7 @@ struct DeviceOperationInstanceFactory<
add_device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances(op_ptrs);
}
}
#endif
return op_ptrs;
}
};
......
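
// A minimal usage sketch (not part of this commit): how a client would query the int8 GEMM
// instances declared above through the instance factory. With this change, the int8 declarations
// and the corresponding factory branch only exist when __int8__ is defined, so the returned
// vector is empty in builds without it. The include path, namespaces, and function name below
// follow the usual composable_kernel client pattern and are assumptions here, not part of this
// diff.
#include <cstdint>
#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" // assumed header path

inline auto query_int8_gemm_instances()
{
    using Row         = ck::tensor_layout::gemm::RowMajor;
    using Col         = ck::tensor_layout::gemm::ColumnMajor;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    // A[M,K] row-major, B[N,K] column-major, C[M,N] row-major, all int8
    using DeviceOp = ck::tensor_operation::device::DeviceGemm<Row,
                                                              Col,
                                                              Row,
                                                              std::int8_t,
                                                              std::int8_t,
                                                              std::int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>;

    // empty unless the library was built with __int8__ defined
    return ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();
}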
......@@ -91,6 +91,42 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
PassThrough,
PassThrough>>>& instances);
void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
NHWGC,
GKYXC,
NHWGK,
BF16,
F32,
BF16,
PassThrough,
PassThrough,
PassThrough>>>& instances);
void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
NHWGC,
GKYXC,
NHWGK,
F16,
F16,
F16,
PassThrough,
PassThrough,
PassThrough>>>& instances);
void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
NHWGC,
GKYXC,
NHWGK,
F32,
F32,
F32,
PassThrough,
PassThrough,
PassThrough>>>& instances);
// conv3d backward weight
void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
......@@ -162,8 +198,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(NumDimSpatial == 1 && is_same_v<InLayout, GNWC> &&
is_same_v<WeiLayout, GKXC> && is_same_v<OutLayout, GNWK>)
if constexpr(NumDimSpatial == 1)
{
if constexpr(is_same_v<InLayout, GNWC> && is_same_v<WeiLayout, GKXC> &&
is_same_v<OutLayout, GNWK>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, float>)
......@@ -175,35 +213,68 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
{
add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances(
op_ptrs);
}
}
else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, GNHWK>)
}
else if constexpr(NumDimSpatial == 2)
{
if constexpr(is_same_v<InLayout, GNHWC> && is_same_v<WeiLayout, GKYXC> &&
is_same_v<OutLayout, GNHWK>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, float>)
{
add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(op_ptrs);
add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
is_same_v<OutDataType, half_t>)
{
add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs);
add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances(
op_ptrs);
}
}
else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, GNDHWC> &&
is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, GNDHWK>)
else if constexpr(is_same_v<InLayout, NHWGC> && is_same_v<WeiLayout, GKYXC> &&
is_same_v<OutLayout, NHWGK>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, float>)
{
add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
is_same_v<OutDataType, half_t>)
{
add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances(
op_ptrs);
}
}
}
else if constexpr(NumDimSpatial == 3)
{
if constexpr(is_same_v<InLayout, GNDHWC> && is_same_v<WeiLayout, GKZYXC> &&
is_same_v<OutLayout, GNDHWK>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, float>)
......@@ -217,13 +288,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances(
op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances(
op_ptrs);
}
}
}
return op_ptrs;
}
......
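
// A minimal usage sketch (not part of this commit): queries the new NHWGC/GKYXC/NHWGK fp16
// 2-D grouped conv backward-weight instances wired into the factory above. The include path and
// the layout/instance namespaces are assumptions; the DeviceGroupedConvBwdWeight template
// arguments mirror the declarations added in this file.
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp" // assumed

inline auto query_nhwgc_bwd_weight_f16_instances()
{
    using NHWGC       = ck::tensor_layout::convolution::NHWGC;
    using GKYXC       = ck::tensor_layout::convolution::GKYXC;
    using NHWGK       = ck::tensor_layout::convolution::NHWGK;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    using DeviceOp =
        ck::tensor_operation::device::DeviceGroupedConvBwdWeight<2,     // 2-D spatial
                                                                 NHWGC, // input layout
                                                                 GKYXC, // weight layout
                                                                 NHWGK, // output layout
                                                                 ck::half_t,
                                                                 ck::half_t,
                                                                 ck::half_t,
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>;

    // non-empty once the NumDimSpatial == 2 / NHWGC branch added above is selected
    return ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();
}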
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
......