Merge branch 'develop' into lwpck-1815

408534d4 · Rostyslav Geyyer · GitHub · a8efb3f0 · da214a5a · 408534d4
Unverified Commit 408534d4 authored Aug 09, 2024 by Rostyslav Geyyer Committed by GitHub Aug 09, 2024
20 changed files
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               F16,
+                                               F16,
+                                               DsDataType,
+                                               F16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               F16,
+                                               F16,
+                                               DsDataType,
+                                               F16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances<GemmMNPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               F16,
+                                               F16,
+                                               DsDataType,
+                                               F16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_instances<Intrawave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               F16,
+                                               F16,
+                                               DsDataType,
+                                               F16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_instances<Intrawave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               F16,
+                                               F16,
+                                               DsDataType,
+                                               F16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_instances<Intrawave,
+                                                                            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               F16,
+                                               F16,
+                                               DsDataType,
+                                               F16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_instances<Interwave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               F16,
+                                               F16,
+                                               DsDataType,
+                                               F16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_instances<Interwave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               F16,
+                                               F16,
+                                               DsDataType,
+                                               F16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_instances<Interwave,
+                                                                            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
@@ -9,11 +9,11 @@ add_instance_library(device_grouped_conv2d_fwd_instance
   xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
   xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
   xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
-   # merged groups
+   # large tensor
   # NHWGC, GKYXC, NHWGK
-   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
-   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp
-   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+   xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+   xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+   xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.cpp
   #mem
   # NHWGC, GKYXC, NHWGK
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -2,44 +2,35 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
+void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
-                                                                F32,
-                                                                F32,
+                                                                BF16,
+                                                                BF16,
                                                                Empty_Tuple,
-                                                                F32,
+                                                                BF16,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(
        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2,
+        device_grouped_conv_fwd_xdl_large_tensor_bf16_instances<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
                                                                ConvFwdDefault>{});
-
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2,
-                                                                NHWGC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                ConvFwd3x3>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -2,14 +2,14 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances(
+void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NHWGC,
                                                                GKYXC,
@@ -25,21 +25,12 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_insta
 {
    add_device_operation_instances(
        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
-                                                                NHWGC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                ConvFwdDefault>{});
-
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
-                                                                NHWGC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                ConvFwd3x3>{});
+        device_grouped_conv_fwd_xdl_large_tensor_f16_instances<2,
+                                                               NHWGC,
+                                                               GKYXC,
+                                                               Empty_Tuple,
+                                                               NHWGK,
+                                                               ConvFwdDefault>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -2,44 +2,35 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances(
+void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
-                                                                BF16,
-                                                                BF16,
+                                                                F32,
+                                                                F32,
                                                                Empty_Tuple,
-                                                                BF16,
+                                                                F32,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(
        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
-                                                                 NHWGC,
-                                                                 GKYXC,
-                                                                 Empty_Tuple,
-                                                                 NHWGK,
-                                                                 ConvFwdDefault>{});
-
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
-                                                                 NHWGC,
-                                                                 GKYXC,
-                                                                 Empty_Tuple,
-                                                                 NHWGK,
-                                                                 ConvFwd3x3>{});
+        device_grouped_conv_fwd_xdl_large_tensor_f32_instances<2,
+                                                               NHWGC,
+                                                               GKYXC,
+                                                               Empty_Tuple,
+                                                               NHWGK,
+                                                               ConvFwdDefault>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
@@ -9,9 +9,9 @@ set(GROUPED_CONV3D_FWD
   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp

-   xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
-   xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
-   xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+   xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+   xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+   xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp

   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp
   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.cpp

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

 namespace ck {
@@ -9,36 +9,28 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+void add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
-                                                                F32,
-                                                                F32,
+                                                                BF16,
+                                                                BF16,
                                                                Empty_Tuple,
-                                                                F32,
+                                                                BF16,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(
        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3,
+        device_grouped_conv_fwd_xdl_large_tensor_bf16_instances<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                ConvFwdDefault>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3,
-                                                                NDHWGC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                NDHWGK,
-                                                                ConvFwd3x3>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

 namespace ck {
@@ -9,7 +9,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+void add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
@@ -25,20 +25,12 @@ void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_in
 {
    add_device_operation_instances(
        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3,
-                                                                NDHWGC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                NDHWGK,
-                                                                ConvFwdDefault>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3,
-                                                                NDHWGC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                NDHWGK,
-                                                                ConvFwd3x3>{});
+        device_grouped_conv_fwd_xdl_large_tensor_f16_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               Empty_Tuple,
+                                                               NDHWGK,
+                                                               ConvFwdDefault>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_large_tensor_f32_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               Empty_Tuple,
+                                                               NDHWGK,
+                                                               ConvFwdDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_add/CMakeLists.txt
+# ONLY XDL_KERNELS
+set(GROUPED_CONV3D_FWD_CONVSCALE_ADD
+   xdl/device_grouped_conv3d_fwd_xdl_convscale_add_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp)
+
+add_instance_library(device_grouped_conv3d_fwd_convscale_add_instance ${GROUPED_CONV3D_FWD_CONVSCALE_ADD})
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_binary_outelementop_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+using ConvScaleAdd = ck::tensor_operation::element_wise::ConvScaleAdd;
+
+void add_device_grouped_conv3d_fwd_xdl_convscale_add_ndhwgc_gkzyxc_ndhwgk_f8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
-                                                                Empty_Tuple,
+                                                                ck::Tuple<NDHWGK>,
                                                                NDHWGK,
-                                                                BF16,
-                                                                BF16,
-                                                                Empty_Tuple,
-                                                                BF16,
+                                                                F8,
+                                                                F8,
+                                                                ck::Tuple<F32>,
+                                                                F8,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>& instances)
+                                                                ConvScaleAdd,
+                                                                F8,
+                                                                F8>>>& instances)
 {
    add_device_operation_instances(
        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3,
-                                                                 NDHWGC,
-                                                                 GKZYXC,
-                                                                 Empty_Tuple,
-                                                                 NDHWGK,
-                                                                 ConvFwdDefault>{});
+        device_grouped_conv_fwd_xdl_binary_outelementop_f8_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC,
+                                                                     ck::Tuple<NDHWGK>,
+                                                                     NDHWGK,
+                                                                     ConvFwdDefault,
+                                                                     ConvScaleAdd>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_binary_outelementop_f8_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC,
+                                                                     ck::Tuple<NDHWGK>,
+                                                                     NDHWGK,
+                                                                     ConvFwd1x1P0,
+                                                                     ConvScaleAdd>{});
    add_device_operation_instances(
        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3,
-                                                                 NDHWGC,
-                                                                 GKZYXC,
-                                                                 Empty_Tuple,
-                                                                 NDHWGK,
-                                                                 ConvFwd3x3>{});
+        device_grouped_conv_fwd_xdl_binary_outelementop_f8_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC,
+                                                                     ck::Tuple<NDHWGK>,
+                                                                     NDHWGK,
+                                                                     ConvFwd1x1S1P0,
+                                                                     ConvScaleAdd>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
+set(FMHA_CPP_FOLDER ${CMAKE_CURRENT_BINARY_DIR})
+set(FMHA_SRC_FOLDER ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/)
+set(CK_TILE_SRC_FOLDER ${CMAKE_SOURCE_DIR}/include/ck_tile/)
+# python stuff
+find_package(PythonInterp 3 REQUIRED)
+
+rocm_install(DIRECTORY ${CK_TILE_SRC_FOLDER} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck_tile)
+
+rocm_install(FILES
+    "${FMHA_SRC_FOLDER}/fmha_fwd.hpp"
+    "${FMHA_SRC_FOLDER}/bias.hpp"
+    "${FMHA_SRC_FOLDER}/mask.hpp"
+    DESTINATION include/ck_tile/ops
+)
+
+# header for building lib
+file(COPY ${FMHA_SRC_FOLDER}/fmha_fwd.hpp DESTINATION ${FMHA_CPP_FOLDER})
+file(COPY ${FMHA_SRC_FOLDER}/bias.hpp DESTINATION ${FMHA_CPP_FOLDER})
+file(COPY ${FMHA_SRC_FOLDER}/mask.hpp DESTINATION ${FMHA_CPP_FOLDER})
+
+# generate a list of kernels, but not actually emit files at config stage
+execute_process(
+  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
+  --list_blobs ${FMHA_CPP_FOLDER}/blob_list.txt
+)
+file(STRINGS ${FMHA_CPP_FOLDER}/blob_list.txt FMHA_FWD_GEN_BLOBS)
+
+# actually generate the cpp files
+add_custom_command(
+  OUTPUT ${FMHA_FWD_GEN_BLOBS}
+  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
+  --output_dir ${FMHA_CPP_FOLDER}
+  COMMENT "Generating mha kernel (cpp) files now ..."
+  VERBATIM
+)
+
+# This is done to remove path info and just
+# have filename. Since, it was cauing the cmake
+# to throw "File name too long"
+set(device_files)
+foreach(filepath IN LISTS FMHA_FWD_GEN_BLOBS)
+    get_filename_component(filename ${filepath} NAME)
+    # Append the filename to the device_files list
+    list(APPEND device_files ${filename})
+endforeach()
+add_custom_target(generate_cpp_files DEPENDS ${FMHA_FWD_GEN_BLOBS})
+
+add_instance_library(device_mha_instance ${device_files})
+
+
+
+if (TARGET device_mha_instance)
+  add_dependencies(device_mha_instance generate_cpp_files)
+endif()
+
--- a/library/src/utility/convolution_parameter.cpp
+++ b/library/src/utility/convolution_parameter.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/host_utility/io.hpp"

@@ -20,6 +20,63 @@ ConvParam::ConvParam(ck::index_t n_dim,
                     const std::vector<ck::index_t>& dilations,
                     const std::vector<ck::index_t>& left_pads,
                     const std::vector<ck::index_t>& right_pads)
+    : num_dim_spatial_(static_cast<ck::long_index_t>(n_dim)),
+      G_(static_cast<ck::long_index_t>(group_count)),
+      N_(static_cast<ck::long_index_t>(n_batch)),
+      K_(static_cast<ck::long_index_t>(n_out_channels)),
+      C_(static_cast<ck::long_index_t>(n_in_channels)),
+      filter_spatial_lengths_(num_dim_spatial_),
+      input_spatial_lengths_(num_dim_spatial_),
+      output_spatial_lengths_(num_dim_spatial_),
+      conv_filter_strides_(num_dim_spatial_),
+      conv_filter_dilations_(num_dim_spatial_),
+      input_left_pads_(num_dim_spatial_),
+      input_right_pads_(num_dim_spatial_)
+{
+    if(static_cast<ck::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
+       static_cast<ck::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
+       static_cast<ck::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
+       static_cast<ck::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
+       static_cast<ck::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
+       static_cast<ck::index_t>(input_right_pads_.size()) != num_dim_spatial_)
+    {
+        throw(
+            std::runtime_error("ConvParam::ConvParam: "
+                               "parameter size is different from number of declared dimensions!"));
+    }
+
+    for(ck::index_t i = 0; i < num_dim_spatial_; ++i)
+    {
+        filter_spatial_lengths_[i] = static_cast<ck::long_index_t>(filters_len[i]);
+        input_spatial_lengths_[i]  = static_cast<ck::long_index_t>(input_len[i]);
+        conv_filter_strides_[i]    = static_cast<ck::long_index_t>(strides[i]);
+        conv_filter_dilations_[i]  = static_cast<ck::long_index_t>(dilations[i]);
+        input_left_pads_[i]        = static_cast<ck::long_index_t>(left_pads[i]);
+        input_right_pads_[i]       = static_cast<ck::long_index_t>(right_pads[i]);
+
+        // XEff = (X - 1) * conv_dilation_w + 1;
+        // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+        const ck::long_index_t x_eff =
+            (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
+
+        output_spatial_lengths_[i] =
+            (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
+                conv_filter_strides_[i] +
+            1;
+    }
+}
+
+ConvParam::ConvParam(ck::long_index_t n_dim,
+                     ck::long_index_t group_count,
+                     ck::long_index_t n_batch,
+                     ck::long_index_t n_out_channels,
+                     ck::long_index_t n_in_channels,
+                     const std::vector<ck::long_index_t>& filters_len,
+                     const std::vector<ck::long_index_t>& input_len,
+                     const std::vector<ck::long_index_t>& strides,
+                     const std::vector<ck::long_index_t>& dilations,
+                     const std::vector<ck::long_index_t>& left_pads,
+                     const std::vector<ck::long_index_t>& right_pads)
    : num_dim_spatial_(n_dim),
      G_(group_count),
      N_(n_batch),
@@ -49,7 +106,8 @@ ConvParam::ConvParam(ck::index_t n_dim,
    {
        // XEff = (X - 1) * conv_dilation_w + 1;
        // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-        const ck::index_t x_eff = (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
+        const ck::long_index_t x_eff =
+            (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;

        output_spatial_lengths_[i] =
            (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
@@ -63,7 +121,7 @@ ConvParam::ConvParam()
 {
 }

-std::vector<ck::index_t> ConvParam::GetOutputSpatialLengths() const
+std::vector<ck::long_index_t> ConvParam::GetOutputSpatialLengths() const
 {
    return output_spatial_lengths_;
 }
@@ -97,46 +155,46 @@ std::string get_conv_param_parser_helper_msg()

 ck::utils::conv::ConvParam parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[])
 {
-    const ck::index_t G = std::stoi(argv[arg_idx++]);
-    const ck::index_t N = std::stoi(argv[arg_idx++]);
-    const ck::index_t K = std::stoi(argv[arg_idx++]);
-    const ck::index_t C = std::stoi(argv[arg_idx++]);
-
-    std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
-    std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
-    std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
-    std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
-    std::vector<ck::index_t> input_left_pads(num_dim_spatial);
-    std::vector<ck::index_t> input_right_pads(num_dim_spatial);
+    const ck::long_index_t G = std::stol(argv[arg_idx++]);
+    const ck::long_index_t N = std::stol(argv[arg_idx++]);
+    const ck::long_index_t K = std::stol(argv[arg_idx++]);
+    const ck::long_index_t C = std::stol(argv[arg_idx++]);
+
+    std::vector<ck::long_index_t> filter_spatial_lengths(num_dim_spatial);
+    std::vector<ck::long_index_t> input_spatial_lengths(num_dim_spatial);
+    std::vector<ck::long_index_t> conv_filter_strides(num_dim_spatial);
+    std::vector<ck::long_index_t> conv_filter_dilations(num_dim_spatial);
+    std::vector<ck::long_index_t> input_left_pads(num_dim_spatial);
+    std::vector<ck::long_index_t> input_right_pads(num_dim_spatial);

    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        filter_spatial_lengths[i] = std::stol(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        input_spatial_lengths[i] = std::stol(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        conv_filter_strides[i] = std::stol(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        conv_filter_dilations[i] = std::stol(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        input_left_pads[i] = std::stol(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        input_right_pads[i] = std::stol(argv[arg_idx++]);
    }

    return ck::utils::conv::ConvParam{num_dim_spatial,