merge from public repo

667cd6ab · illsilin · 7d50244e · 365f39ae · 667cd6ab · 667cd6ab
Commit 667cd6ab authored Nov 05, 2024 by illsilin
20 changed files
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{ // Hands the caller's vector and the int8 large-tensor instance list to the shared helper.
+    add_device_operation_instances(
+        instances, // taken by non-const reference; presumably extended in place (confirm in helper)
+        device_grouped_conv_fwd_xdl_large_tensor_int8_instances<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                ConvFwdDefault>{}); // only specialization used here
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, g, c, hi, wi] * wei[g, k, y, x, c] = out[n, g, k, ho, wo]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{ // Single helper call: NGCHW/NGKHW layout tags, ConvFwdDefault, Interwave.
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NGCHW,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NGKHW,
+                                                                                  ConvFwdDefault,
+                                                                                  Interwave>{}); // value-initialized temporary
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, g, c, hi, wi] * wei[g, k, y, x, c] = out[n, g, k, ho, wo]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{ // Single helper call: NGCHW/NGKHW layout tags, ConvFwdDefault, Intrawave.
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NGCHW,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NGKHW,
+                                                                                  ConvFwdDefault,
+                                                                                  Intrawave>{}); // value-initialized temporary
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{ // Four helper calls that differ only in the sixth template argument; Interwave is fixed.
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdDefault, // call 1 of 4
+                                                                                  Interwave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0, // call 2 of 4
+                                                                                  Interwave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1S1P0, // call 3 of 4
+                                                                                  Interwave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdOddC, // call 4 of 4
+                                                                                  Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{ // Four helper calls that differ only in the sixth template argument; Intrawave is fixed.
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdDefault, // call 1 of 4
+                                                                                  Intrawave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0, // call 2 of 4
+                                                                                  Intrawave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1S1P0, // call 3 of 4
+                                                                                  Intrawave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdOddC, // call 4 of 4
+                                                                                  Intrawave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, g, c, hi, wi] * wei[g, k, y, x, c] = out[n, g, k, ho, wo]
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{ // Two helper calls with NGCHW/NGKHW layout tags: ConvFwdDefault first, then ConvFwd3x3.
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2,
+                                                                 NGCHW,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NGKHW,
+                                                                 ConvFwdDefault>{}); // call 1 of 2
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2,
+                                                                 NGCHW,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NGKHW,
+                                                                 ConvFwd3x3>{}); // call 2 of 2
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{ // Two helper calls with NHWGC/NHWGK layout tags: ConvFwdDefault first, then ConvFwd3x3.
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2,
+                                                                 NHWGC,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NHWGK,
+                                                                 ConvFwdDefault>{}); // call 1 of 2
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2,
+                                                                 NHWGC,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NHWGK,
+                                                                 ConvFwd3x3>{}); // call 2 of 2
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -31,6 +31,8 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instanc
                                                              Tuple<>,
                                                              NHWGK,
                                                              ConvFwdDefault>{});
+#if 0 // Re-enable once dynamic op is optimized (it currently generates many virtual functions,
+      // which causes long compilation times)
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<2,
@@ -47,6 +49,7 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instanc
                                                              Tuple<>,
                                                              NHWGK,
                                                              ConvFwd1x1S1P0>{});
+#endif
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -31,6 +31,8 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance
                                                             Tuple<>,
                                                             NHWGK,
                                                             ConvFwdDefault>{});
+#if 0 // Re-enable once dynamic op is optimized (it currently generates many virtual functions,
+      // which causes long compilation times)
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<2,
@@ -47,6 +49,7 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance
                                                             Tuple<>,
                                                             NHWGK,
                                                             ConvFwd1x1S1P0>{});
+#endif
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -31,6 +31,8 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance
                                                             Tuple<>,
                                                             NHWGK,
                                                             ConvFwdDefault>{});
+#if 0 // Re-enable once dynamic op is optimized (it currently generates many virtual functions,
+      // which causes long compilation times)
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<2,
@@ -47,6 +49,7 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance
                                                             Tuple<>,
                                                             NHWGK,
                                                             ConvFwd1x1S1P0>{});
+#endif
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp
@@ -30,6 +30,8 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instanc
                                                              Tuple<>,
                                                              NHWGK,
                                                              ConvFwdDefault>{});
+#if 0 // Re-enable once dynamic op is optimized (it currently generates many virtual functions,
+      // which causes long compilation times)
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<2,
@@ -46,6 +48,7 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instanc
                                                              Tuple<>,
                                                              NHWGK,
                                                              ConvFwd1x1S1P0>{});
+#endif
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -31,6 +31,8 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_inst
                                                              Tuple<>,
                                                              NDHWGK,
                                                              ConvFwdDefault>{});
+#if 0 // Re-enable once dynamic op is optimized (it currently generates many virtual functions,
+      // which causes long compilation times)
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<3,
@@ -47,6 +49,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_inst
                                                              Tuple<>,
                                                              NDHWGK,
                                                              ConvFwd1x1S1P0>{});
+#endif
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -31,6 +31,8 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_insta
                                                             Tuple<>,
                                                             NDHWGK,
                                                             ConvFwdDefault>{});
+#if 0 // Re-enable once dynamic op is optimized (it currently generates many virtual functions,
+      // which causes long compilation times)
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<3,
@@ -47,6 +49,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_insta
                                                             Tuple<>,
                                                             NDHWGK,
                                                             ConvFwd1x1S1P0>{});
+#endif
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -31,6 +31,8 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_insta
                                                             Tuple<>,
                                                             NDHWGK,
                                                             ConvFwdDefault>{});
+#if 0 // Re-enable once dynamic op is optimized (it currently generates many virtual functions,
+      // which causes long compilation times)
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<3,
@@ -47,6 +49,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_insta
                                                             Tuple<>,
                                                             NDHWGK,
                                                             ConvFwd1x1S1P0>{});
+#endif
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
@@ -30,6 +30,8 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_inst
                                                              Tuple<>,
                                                              NDHWGK,
                                                              ConvFwdDefault>{});
+#if 0 // Enable with dynamic op optimizations (for now, generating a lot of virtual functions
+      // causes long compilation time)
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<3,
@@ -46,6 +48,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_inst
                                                              Tuple<>,
                                                              NDHWGK,
                                                              ConvFwd1x1S1P0>{});
+#endif
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
@@ -27,11 +27,6 @@ rocm_install(FILES ${MHA_HEADERS} DESTINATION include/ck_tile/ops)
 # headers for building lib
 file(COPY ${MHA_HEADERS} DESTINATION ${FMHA_CPP_FOLDER})

-# Delete the blob file if it exists to avoid append of old content.
-if(EXISTS ${FMHA_CPP_FOLDER}/blob_list.txt)
-    file(REMOVE ${FMHA_CPP_FOLDER}/blob_list.txt)
-endif()
-
 set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd")

 # generate a list of kernels, but not actually emit files at config stage

--- a/profiler/src/profile_gemm_universal.cpp
+++ b/profiler/src/profile_gemm_universal.cpp
@@ -101,7 +101,9 @@ int profile_gemm_universal(int argc, char* argv[])
    using F32  = float;
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
-    using F8   = ck::f8_t;
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+    using F8 = ck::f8_t;
+#endif

    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -162,6 +164,7 @@ int profile_gemm_universal(int argc, char* argv[])
    {
        return profile(F16{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
    }
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(F16{}, F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
@@ -178,6 +181,7 @@ int profile_gemm_universal(int argc, char* argv[])
    {
        return profile(F8{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
    }
+#endif
    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
@@ -194,6 +198,7 @@ int profile_gemm_universal(int argc, char* argv[])
    {
        return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{});
    }
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
    else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
@@ -202,6 +207,7 @@ int profile_gemm_universal(int argc, char* argv[])
    {
        return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
    }
+#endif
    else
    {
        std::cout << "this data_type & layout is not implemented" << std::endl;

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -64,11 +64,11 @@ function(add_test_executable TEST_NAME)
    #only continue if there are some source files left on the list
    if(ARGN)
        if(ARGN MATCHES "_xdl")
-             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
        elseif(ARGN MATCHES "_wmma")
-             list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
        elseif(ARGN MATCHES "_smfmac")
-             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
        endif()
        set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
        add_executable(${TEST_NAME} ${ARGN})
@@ -141,11 +141,11 @@ function(add_gtest_executable TEST_NAME)
    #only continue if there are some source files left on the list
    if(ARGN)
        if(ARGN MATCHES "_xdl")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
        elseif(ARGN MATCHES "_wmma")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
        elseif(ARGN MATCHES "_smfmac")
-             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
        endif()
        set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
        add_executable(${TEST_NAME} ${ARGN})

--- a/test/gemm_universal/test_gemm_universal_xdl.cpp
+++ b/test/gemm_universal/test_gemm_universal_xdl.cpp
@@ -56,7 +56,7 @@ class TestGemmUniversal_KM_NK
 using KernelTypes_MK_KN = ::testing::Types<
    //         ADataType, BDataType, ComputeDataType, CDataType
    std::tuple<      F16,       F16,             F16,     F16>,
-#if (defined CK_ENABLE_FP8)
+#if defined(CK_ENABLE_FP8) && defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
    std::tuple<      F16,        F8,             F16,     F16>,
    std::tuple<       F8,       F16,             F16,     F16>,
    std::tuple<       F8,        F8,              F8,    BF16>,
@@ -66,7 +66,7 @@ using KernelTypes_MK_KN = ::testing::Types<
 using KernelTypes_MK_NK = ::testing::Types<
    //         ADataType, BDataType, ComputeDataType, CDataType
    std::tuple<      F16,       F16,             F16,     F16>,
-#if (defined CK_ENABLE_FP8)
+#if defined(CK_ENABLE_FP8) && defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
    std::tuple<      F16,        F8,             F16,     F16>,
    std::tuple<       F8,       F16,             F16,     F16>,
    std::tuple<       F8,        F8,              F8,    BF16>,

--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
@@ -58,13 +58,13 @@ using KernelTypes1d = ::testing::Types<std::tuple<float, GNWC, GKXC, GNWK>,
 using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWC, GKYXC, GNHWK>,
                                       std::tuple<ck::half_t, GNHWC, GKYXC, GNHWK>,
                                       std::tuple<ck::bhalf_t, GNHWC, GKYXC, GNHWK>,
-                                       std::tuple<int8_t, GNHWC, GKYXC, GNHWK>,
                                       std::tuple<float, NHWGC, GKYXC, NHWGK>,
                                       std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>,
                                       std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>,
                                       std::tuple<int8_t, NHWGC, GKYXC, NHWGK>,
                                       std::tuple<float, NGCHW, GKYXC, NGKHW>,
-                                       std::tuple<ck::half_t, NGCHW, GKYXC, NGKHW>>;
+                                       std::tuple<ck::half_t, NGCHW, GKYXC, NGKHW>,
+                                       std::tuple<int8_t, NGCHW, GKYXC, NGKHW>>;

 using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK>,
                                       std::tuple<ck::half_t, GNDHWC, GKZYXC, GNDHWK>,