Merge branch 'jizhan/enable_bf16_atomic_add' of...

Merge branch 'jizhan/enable_bf16_atomic_add' of github.com:zjing14/composable_kernel into jizhan/enable_bf16_atomic_add

Merge branch 'jizhan/enable_bf16_atomic_add' of...
Merge branch 'jizhan/enable_bf16_atomic_add' of github.com:zjing14/composable_kernel into jizhan/enable_bf16_atomic_add
f6fdb74b · Jing Zhang · 79ac8751 · 73de444f · f6fdb74b · f6fdb74b
Commit f6fdb74b authored Aug 01, 2024 by Jing Zhang
3 changed files
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3, // presumably the spatial dimension count (conv3d) - confirm
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances) // non-const ref, forwarded to both calls below
+{ // Hands two F16 merged-groups instance lists to add_device_operation_instances: ConvFwdDefault first, then ConvFwd3x3.
+    add_device_operation_instances( // 1st call: ConvFwdDefault variant
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                ConvFwdDefault>{}); // default-constructed list object passed by value
+    add_device_operation_instances( // 2nd call: ConvFwd3x3 variant, same layouts as the 1st call
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                ConvFwd3x3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3, // presumably the spatial dimension count (conv3d) - confirm
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances) // non-const ref, forwarded to both calls below
+{ // Hands two F32 merged-groups instance lists to add_device_operation_instances: ConvFwdDefault first, then ConvFwd3x3.
+    add_device_operation_instances( // 1st call: ConvFwdDefault variant
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                ConvFwdDefault>{}); // default-constructed list object passed by value
+    add_device_operation_instances( // 2nd call: ConvFwd3x3 variant, same layouts as the 1st call
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                ConvFwd3x3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/test/smfmac_op/smfmac_op_xdl.cpp
+++ b/test/smfmac_op/smfmac_op_xdl.cpp
@@ -13,6 +13,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "test/smfmac_op/smfmac_op_util.hpp"
+#include "ck/host_utility/device_prop.hpp"

 using BF16        = ck::bhalf_t;
 using F16         = ck::half_t;
@@ -38,40 +39,43 @@ class TestSmfmac : public ::testing::Test

    void Run()
    {
-        bool pass                     = true;
-        constexpr auto matmul_default = ck::smfmac_op_util::matmul<Src1Type,
-                                                                   Src1VecSize,
-                                                                   Src2Type,
-                                                                   Src2VecSize,
-                                                                   GPUAccType,
-                                                                   AccVecSize,
-                                                                   DstType,
-                                                                   M,
-                                                                   N,
-                                                                   K>;
+        bool pass = true; // stays true if the gfx942 branch below is not taken
+        if(ck::get_device_name() == "gfx942") // only exercise the kernel when the device reports gfx942
+        { // NOTE(review): on any other device EXPECT_TRUE(pass) succeeds without running anything; GTEST_SKIP() may be clearer - confirm
+            constexpr auto matmul_default = ck::smfmac_op_util::matmul<Src1Type, // kernel under test
+                                                                       Src1VecSize,
+                                                                       Src2Type,
+                                                                       Src2VecSize,
+                                                                       GPUAccType,
+                                                                       AccVecSize,
+                                                                       DstType,
+                                                                       M,
+                                                                       N,
+                                                                       K>;

-        constexpr auto smfmac_kernel_container = std::make_tuple(matmul_default);
-
-        ck::static_for<0, std::tuple_size_v<decltype(smfmac_kernel_container)>, 1>{}([&](auto i) {
-            pass &= ck::smfmac_op_util::TestSmfmac<
-                std::tuple_element_t<i.value, decltype(smfmac_kernel_container)>,
-                Src1Type,
-                Src2Type,
-                DstType,
-                GPUAccType,
-                CPUAccType,
-                decltype(Row{}),
-                decltype(Row{}),
-                decltype(Row{}),
-                PassThrough,
-                PassThrough,
-                PassThrough,
-                AccVecSize,
-                M,
-                N,
-                K>{}(std::get<ck::Number<i>{}>(smfmac_kernel_container));
-        });
+            constexpr auto smfmac_kernel_container = std::make_tuple(matmul_default); // single-entry tuple of kernels to test

+            ck::static_for<0, std::tuple_size_v<decltype(smfmac_kernel_container)>, 1>{}(
+                [&](auto i) { // presumably invoked once per tuple index by ck::static_for - confirm
+                    pass &= ck::smfmac_op_util::TestSmfmac< // fold this kernel's result into pass
+                        std::tuple_element_t<i.value, decltype(smfmac_kernel_container)>,
+                        Src1Type,
+                        Src2Type,
+                        DstType,
+                        GPUAccType,
+                        CPUAccType,
+                        decltype(Row{}),
+                        decltype(Row{}),
+                        decltype(Row{}),
+                        PassThrough,
+                        PassThrough,
+                        PassThrough,
+                        AccVecSize,
+                        M,
+                        N,
+                        K>{}(std::get<ck::Number<i>{}>(smfmac_kernel_container));
+                });
+        } // end of the gfx942-only section
        EXPECT_TRUE(pass);
    }
 };