Clean DTYPES conditions in CMake (#974)

* Add a condition to build fp8 instances * simplified buffer_load/store * add bfp8/fp8 * fixed * remove all f8/bf8 condition include folder * fixed cmake conditions * fixed DTYPES=fp16/bfp16 * fix * fixed buffer_load * fixed buffer_store * fix * clean example cmake files * fixed ci * fixed cit --------- Co-authored-by: Rostyslav Geyyer <rosty.geyyer@amd.com> Co-authored-by: Jing Zhang <jizha@amd.com>

Clean DTYPES conditions in CMake (#974)
* Add a condition to build fp8 instances * simplified buffer_load/store * add bfp8/fp8 * fixed * remove all f8/bf8 condition include folder * fixed cmake conditions * fixed DTYPES=fp16/bfp16 * fix * fixed buffer_load * fixed buffer_store * fix * clean example cmake files * fixed ci * fixed cit --------- Co-authored-by: Rostyslav Geyyer <rosty.geyyer@amd.com> Co-authored-by: Jing Zhang <jizha@amd.com>
bf435140 · zjing14 · GitHub · 1cc36ba5 · bf435140 · bf435140
Unverified Commit bf435140 authored Oct 18, 2023 by zjing14 Committed by GitHub Oct 18, 2023
17 changed files
--- a/include/ck/utility/f8_utils.hpp
+++ b/include/ck/utility/f8_utils.hpp
@@ -6,8 +6,6 @@
 #include "ck/utility/data_type.hpp"
 // these conversions are disabled if native conversions available
-#if !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
-#if defined CK_ENABLE_FP8 || defined CK_ENABLE_BF8
 namespace ck {
 // fp8 rounding modes
@@ -244,5 +242,3 @@ __host__ __device__ Y cast_from_f8(X x)
 }
 } // namespace ck::utils
-#endif // #if defined CK_ENABLE_FP8 || defined CK_ENABLE_BF8
-#endif // #if !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -95,7 +95,6 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_
    return type_convert<bhalf_t>(x_fp32);
 }
-#if defined CK_ENABLE_FP8
 // convert fp32 to fp8
 template <>
 inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
@@ -173,9 +172,7 @@ inline __host__ __device__ half_t type_convert<half_t, f8_t>(f8_t x)
    return type_convert<half_t>(type_convert<float>(x));
 #endif
 }
-#endif
-#if defined CK_ENABLE_BF8
 // convert fp32 to bf8
 template <>
 inline __host__ __device__ bf8_t type_convert<bf8_t, float>(float x)
@@ -253,7 +250,6 @@ inline __host__ __device__ half_t type_convert<half_t, bf8_t>(bf8_t x)
    return type_convert<half_t>(type_convert<float>(x));
 #endif
 }
-#endif
 // Declare a template function for bf16 conversion using RTN
 template <typename Y, typename X>
@@ -316,7 +312,6 @@ inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, half_t>(h
 template <typename Y, typename X>
 __host__ __device__ constexpr Y f8_convert_sr(X x);
-#if defined CK_ENABLE_FP8
 // convert fp32 to fp8 with stochastic rounding
 template <>
 inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
@@ -365,9 +360,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
    return f8_convert_sr<f8_t>(type_convert<float>(x));
 #endif
 }
-#endif
-#if defined CK_ENABLE_BF8
 // convert fp32 to bf8 with stochastic rounding
 template <>
 inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
@@ -417,6 +410,5 @@ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, half_t>(half_t x)
    return f8_convert_sr<bf8_t>(type_convert<float>(x));
 #endif
 }
-#endif
 } // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -20,12 +20,8 @@ using F16  = ck::half_t;
 using BF16 = ck::bhalf_t;
 using I8   = int8_t;
 using I32  = int32_t;
-#if defined CK_ENABLE_FP8
+using F8   = ck::f8_t;
-using F8 = ck::f8_t;
+using BF8  = ck::bf8_t;
-#endif
-#if defined CK_ENABLE_BF8
-using BF8 = ck::bf8_t;
-#endif
 using Empty_Tuple = ck::Tuple<>;

--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
@@ -240,11 +240,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
        if constexpr(NumDimSpatial == 1 && is_same_v<InLayout, NWC> && is_same_v<WeiLayout, KXC> &&
                     is_same_v<OutLayout, NWK>)
        {
+#ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                         is_same_v<OutDataType, float>)
            {
                add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(op_ptrs);
            }
+#endif
 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                         is_same_v<OutDataType, half_t>)
@@ -267,17 +269,23 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
            }
 #endif
        }
-        else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWC> &&
-                          is_same_v<WeiLayout, KYXC> && is_same_v<OutLayout, NHWK>)
+                     is_same_v<WeiLayout, KYXC> && is_same_v<OutLayout, NHWK>)
        {
+#ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                         is_same_v<OutDataType, float>)
            {
                add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
-#ifdef DL_KERNELS
+            }
-                add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
 #endif
+#if defined(DL_KERNELS) && defined(CK_ENABLE_FP32)
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float>)
+            {
+                add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
            }
+#endif
 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                         is_same_v<OutDataType, half_t>)
@@ -306,14 +314,16 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
            }
 #endif
        }
-        else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWC> &&
+        if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWC> &&
-                          is_same_v<WeiLayout, KZYXC> && is_same_v<OutLayout, NDHWK>)
+                     is_same_v<WeiLayout, KZYXC> && is_same_v<OutLayout, NDHWK>)
        {
+#ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                         is_same_v<OutDataType, float>)
            {
                add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(op_ptrs);
            }
+#endif
 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                         is_same_v<OutDataType, half_t>)

--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
@@ -98,30 +98,31 @@ struct DeviceOperationInstanceFactory<
        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWC> &&
                     is_same_v<WeiLayout, KYXC> && is_same_v<OutLayout, NHWK>)
        {
+#ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                         is_same_v<OutDataType, float>)
            {
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
            }
+#endif
 #ifdef CK_ENABLE_FP16
-            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                              is_same_v<OutDataType, half_t>)
+                         is_same_v<OutDataType, half_t>)
            {
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
                add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
            }
 #endif
 #ifdef CK_ENABLE_BF16
-            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                              is_same_v<WeiDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
-                              is_same_v<OutDataType, ck::bhalf_t>)
            {
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(op_ptrs);
            }
 #endif
 #ifdef CK_ENABLE_INT8
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                              is_same_v<OutDataType, int8_t>)
+                         is_same_v<OutDataType, int8_t>)
            {
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(op_ptrs);
            }

--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
@@ -155,7 +155,7 @@ struct DeviceOperationInstanceFactory<
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 #ifdef CK_ENABLE_FP32
        if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
-                     is_same_v<CDataType, float>)
+                     is_same_v<CDataType, float> && is_same_v<ComputeType, float>)
        {
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<CLayout, Row>)
@@ -180,8 +180,8 @@ struct DeviceOperationInstanceFactory<
        }
 #endif
 #ifdef CK_ENABLE_FP16
-        else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
-                          is_same_v<CDataType, half_t> && is_same_v<ComputeType, half_t>)
+                     is_same_v<CDataType, half_t> && is_same_v<ComputeType, half_t>)
        {
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<CLayout, Row>)
@@ -206,8 +206,8 @@ struct DeviceOperationInstanceFactory<
        }
 #endif
 #if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_FP8))
-        else if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, half_t> &&
+        if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, half_t> &&
-                          is_same_v<CDataType, half_t>)
+                     is_same_v<CDataType, half_t> && is_same_v<ComputeType, half_t>)
        {
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<CLayout, Row>)
@@ -230,8 +230,8 @@ struct DeviceOperationInstanceFactory<
                add_device_gemm_xdl_splitk_f8_f16_f16_km_nk_mn_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
-                          is_same_v<CDataType, half_t>)
+                     is_same_v<CDataType, half_t> && is_same_v<ComputeType, half_t>)
        {
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<CLayout, Row>)

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
@@ -627,8 +627,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
 #ifdef CK_ENABLE_FP16
-                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                                  is_same_v<OutDataType, half_t>)
+                             is_same_v<OutDataType, half_t>)
                {
 #ifdef DL_KERNELS
                    add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instances(op_ptrs);
@@ -637,9 +637,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
 #ifdef CK_ENABLE_BF16
-                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
-                                  is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, ck::bhalf_t>)
-                                  is_same_v<OutDataType, ck::bhalf_t>)
                {
 #ifdef DL_KERNELS
                    add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances(
@@ -650,8 +649,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
            }
-            else if constexpr(is_same_v<InLayout, NWGC> && is_same_v<WeiLayout, GKXC> &&
+            if constexpr(is_same_v<InLayout, NWGC> && is_same_v<WeiLayout, GKXC> &&
-                              is_same_v<OutLayout, NWGK>)
+                         is_same_v<OutLayout, NWGK>)
            {
 #ifdef DL_KERNELS
 #ifdef CK_ENABLE_FP32
@@ -662,16 +661,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
 #ifdef CK_ENABLE_FP16
-                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                                  is_same_v<OutDataType, half_t>)
+                             is_same_v<OutDataType, half_t>)
                {
                    add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instances(op_ptrs);
                }
 #endif
 #ifdef CK_ENABLE_BF16
-                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
-                                  is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, ck::bhalf_t>)
-                                  is_same_v<OutDataType, ck::bhalf_t>)
                {
                    add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instances(
                        op_ptrs);
@@ -680,7 +678,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif
            }
        }
-        else if constexpr(NumDimSpatial == 2)
+        if constexpr(NumDimSpatial == 2)
        {
            if constexpr(is_same_v<InLayout, GNHWC> && is_same_v<WeiLayout, GKYXC> &&
                         is_same_v<OutLayout, GNHWK>)
@@ -698,8 +696,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
 #ifdef CK_ENABLE_FP16
-                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                                  is_same_v<OutDataType, half_t>)
+                             is_same_v<OutDataType, half_t>)
                {
 #ifdef DL_KERNELS
                    add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instances(
@@ -710,9 +708,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
 #ifdef CK_ENABLE_BF16
-                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
-                                  is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, ck::bhalf_t>)
-                                  is_same_v<OutDataType, ck::bhalf_t>)
                {
 #ifdef DL_KERNELS
                    add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances(
@@ -723,8 +720,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
            }
-            else if constexpr(is_same_v<InLayout, NHWGC> && is_same_v<WeiLayout, GKYXC> &&
+            if constexpr(is_same_v<InLayout, NHWGC> && is_same_v<WeiLayout, GKYXC> &&
-                              is_same_v<OutLayout, NHWGK>)
+                         is_same_v<OutLayout, NHWGK>)
            {
 #ifdef CK_ENABLE_FP32
                if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
@@ -739,8 +736,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
 #ifdef CK_ENABLE_FP16
-                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                                  is_same_v<OutDataType, half_t>)
+                             is_same_v<OutDataType, half_t>)
                {
 #ifdef DL_KERNELS
                    add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instances(
@@ -751,9 +748,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
 #ifdef CK_ENABLE_BF16
-                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
-                                  is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, ck::bhalf_t>)
-                                  is_same_v<OutDataType, ck::bhalf_t>)
                {
 #ifdef DL_KERNELS
                    add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances(
@@ -765,7 +761,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif
            }
        }
-        else if constexpr(NumDimSpatial == 3)
+        if constexpr(NumDimSpatial == 3)
        {
            if constexpr(is_same_v<InLayout, GNDHWC> && is_same_v<WeiLayout, GKZYXC> &&
                         is_same_v<OutLayout, GNDHWK>)
@@ -783,8 +779,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
 #ifdef CK_ENABLE_FP16
-                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                                  is_same_v<OutDataType, half_t>)
+                             is_same_v<OutDataType, half_t>)
                {
 #ifdef DL_KERNELS
                    add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instances(
@@ -799,9 +795,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
 #ifdef CK_ENABLE_BF16
-                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
-                                  is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, ck::bhalf_t>)
-                                  is_same_v<OutDataType, ck::bhalf_t>)
                {
 #ifdef DL_KERNELS
                    add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances(
@@ -822,8 +817,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
            }
-            else if constexpr(is_same_v<InLayout, NDHWGC> && is_same_v<WeiLayout, GKZYXC> &&
+            if constexpr(is_same_v<InLayout, NDHWGC> && is_same_v<WeiLayout, GKZYXC> &&
-                              is_same_v<OutLayout, NDHWGK>)
+                         is_same_v<OutLayout, NDHWGK>)
            {
 #ifdef CK_ENABLE_FP32
                if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
@@ -838,10 +833,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
 #ifdef CK_ENABLE_FP16
-                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                                  is_same_v<OutDataType, half_t> &&
+                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
-                                  is_same_v<ComputeTypeA, half_t> &&
+                             is_same_v<ComputeTypeB, half_t>)
-                                  is_same_v<ComputeTypeB, half_t>)
                {
 #ifdef DL_KERNELS
                    add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
@@ -856,9 +850,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
 #ifdef CK_ENABLE_BF16
-                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
-                                  is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, ck::bhalf_t>)
-                                  is_same_v<OutDataType, ck::bhalf_t>)
                {
 #ifdef DL_KERNELS
                    add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
@@ -879,9 +872,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                }
 #endif
 #if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
-                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                                  is_same_v<OutDataType, half_t> &&
+                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, bf8_t> &&
-                                  is_same_v<ComputeTypeA, bf8_t> && is_same_v<ComputeTypeB, f8_t>)
+                             is_same_v<ComputeTypeB, f8_t>)
                {
                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
                        op_ptrs);

--- a/library/include/ck/library/utility/check_err.hpp
+++ b/library/include/ck/library/utility/check_err.hpp
@@ -230,7 +230,6 @@ check_err(const Range& out,
    return res;
 }
-#if defined CK_ENABLE_FP8
 template <typename Range, typename RefRange>
 std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_same_v<ranges::range_value_t<Range>, f8_t>),
@@ -275,9 +274,7 @@ check_err(const Range& out,
    }
    return res;
 }
-#endif
-#if defined CK_ENABLE_BF8
 template <typename Range, typename RefRange>
 std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_same_v<ranges::range_value_t<Range>, bf8_t>),
@@ -322,7 +319,6 @@ check_err(const Range& out,
    }
    return res;
 }
-#endif
 } // namespace utils
 } // namespace ck
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -2,44 +2,44 @@ function(add_instance_library INSTANCE_NAME)
    message("adding instance ${INSTANCE_NAME}")
    set(result 1)
    if(DEFINED DTYPES)
-      foreach(source IN LISTS ARGN)
+        foreach(source IN LISTS ARGN)
-        set(test 0)
+            set(test 0)
-        foreach(type IN LISTS DTYPES)
+            foreach(type IN LISTS DTYPES)
                if(type MATCHES "fp16")
-                        set(type1 "_f16")
+                    set(type1 "_f16")
                elseif(type MATCHES "fp32")
-                        set(type1 "_f32")
+                    set(type1 "_f32")
                elseif(type MATCHES "fp8")
-                        set(type1 "_f8")
+                    set(type1 "_f8")
                elseif(type MATCHES "bf16")
-                        set(type1 "_b16")
+                    set(type1 "_b16")
                elseif(type MATCHES "fp64")
-                        set(type1 "_f64")
+                    set(type1 "_f64")
                elseif(type MATCHES "int8")
-                        set(type1 "_i8")
+                    set(type1 "_i8")
                endif()
                #make an exception for reduction kernels
-                if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}" OR "${source}" MATCHES "device_reduce_instance")
+                if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}" OR "${source}" MATCHES "device_reduce_instance" OR ${source} MATCHES "device_image_to_column")
-                        #if filename matches any selected type, exit type loop and do no exclude the file from the list
+                    #if filename matches any selected type, exit type loop and do no exclude the file from the list
-                        set(test 0)
+                    set(test 0)
-                        break()
+                    break()
                elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
-                  source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
+                    source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
-                  NOT(source MATCHES type OR source MATCHES type1))
+                    NOT(source MATCHES type OR source MATCHES type1))
-                        #if filename contains a type which doesn't match any selected type, mark it for removal
+                    #if filename contains a type which doesn't match any selected type, mark it for removal
-                        set(test 1)
+                    set(test 1)
                endif()
-        endforeach()
+            endforeach()
-        if(test EQUAL 1)
+            if(test EQUAL 1)
                message("removing instance ${source} ")
                list(REMOVE_ITEM ARGN "${source}")
-        endif()
+            endif()
-      endforeach()
+        endforeach()
    endif()
    foreach(source IN LISTS ARGN)
        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
-                message("removing dl instance ${source} ")
+            message("removing dl instance ${source} ")
-                list(REMOVE_ITEM ARGN "${source}")
+            list(REMOVE_ITEM ARGN "${source}")
        endif()
    endforeach()
    #only continue if there are some source files left on the list
@@ -49,8 +49,10 @@ function(add_instance_library INSTANCE_NAME)
        set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
        clang_tidy_check(${INSTANCE_NAME})
        set(result 0)
+        message("add_instance_library ${INSTANCE_NAME}")
+    else()
+        message("skip_instance_libary ${INSTANCE_NAME}")
    endif()
-    #message("add_instance_library returns ${result}")
    set(result ${result} PARENT_SCOPE)
 endfunction(add_instance_library INSTANCE_NAME)
@@ -58,65 +60,70 @@ endfunction(add_instance_library INSTANCE_NAME)
 file(GLOB dir_list LIST_DIRECTORIES true *)
 set(CK_DEVICE_INSTANCES)
 FOREACH(subdir_path ${dir_list})
-set(target_dir)
+    set(target_dir)
-IF(IS_DIRECTORY "${subdir_path}")
+    IF(IS_DIRECTORY "${subdir_path}")
-    set(cmake_instance)
+        set(cmake_instance)
-    file(READ "${subdir_path}/CMakeLists.txt" cmake_instance)
+        file(READ "${subdir_path}/CMakeLists.txt" cmake_instance)
-    set(add_inst 0)
+        set(add_inst 0)
-    if(("${cmake_instance}" MATCHES "_fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8")
+        if(("${cmake_instance}" MATCHES "_fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8")
            message("fp8 instance found!")
            set(add_inst 1)
-    endif()
+        endif()
-    if(("${cmake_instance}" MATCHES "_fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
+        if(("${cmake_instance}" MATCHES "_fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
            message("fp16 instance found!")
            set(add_inst 1)
-    endif()
+        endif()
-    if(("${cmake_instance}" MATCHES "_fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
+        if(("${cmake_instance}" MATCHES "_fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
            message("fp32 instance found!")
            set(add_inst 1)
-    endif()
+        endif()
-    if(("${cmake_instance}" MATCHES "_fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
+        if(("${cmake_instance}" MATCHES "_fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
            message("fp64 instance found!")
            set(add_inst 1)
-    endif()
+        endif()
-    if("${cmake_instance}" MATCHES "_bf16" AND DTYPES MATCHES "bf16")
+        if("${cmake_instance}" MATCHES "_bf16" AND DTYPES MATCHES "bf16")
            message("bf16 instance found!")
            set(add_inst 1)
-    endif()
+        endif()
-    if(("${cmake_instance}" MATCHES "_int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
+        if(("${cmake_instance}" MATCHES "_int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
            message("int8 instance found!")
            set(add_inst 1)
-    endif()
+        endif()
-    if(NOT "${cmake_instance}" MATCHES "_fp8" OR
+        if(NOT ("${cmake_instance}" MATCHES "_fp8" OR
-                   NOT "${cmake_instance}" MATCHES "_f8" OR
+                "${cmake_instance}" MATCHES "_f8" OR
-                   NOT "${cmake_instance}" MATCHES "_fp16" OR
+                "${cmake_instance}" MATCHES "_fp16" OR
-                   NOT "${cmake_instance}" MATCHES "_f16" OR
+                "${cmake_instance}" MATCHES "_f16" OR
-                   NOT "${cmake_instance}" MATCHES "_fp32" OR
+                "${cmake_instance}" MATCHES "_fp32" OR
-                   NOT "${cmake_instance}" MATCHES "_f32" OR
+                "${cmake_instance}" MATCHES "_f32" OR
-                   NOT "${cmake_instance}" MATCHES "_fp64" OR
+                "${cmake_instance}" MATCHES "_fp64" OR
-                   NOT "${cmake_instance}" MATCHES "_f64" OR
+                "${cmake_instance}" MATCHES "_f64" OR
-                   NOT "${cmake_instance}" MATCHES "_bf16" OR
+                "${cmake_instance}" MATCHES "_bf16" OR
-                   NOT "${cmake_instance}" MATCHES "_int8" OR
+                "${cmake_instance}" MATCHES "_int8" OR
-                   NOT "${cmake_instance}" MATCHES "_i8" OR
+                "${cmake_instance}" MATCHES "_i8" OR
-                   NOT "${cmake_instance}" MATCHES "_int4" OR
+                "${cmake_instance}" MATCHES "_int4"))
-                   NOT DEFINED DTYPES)
+            message("instance should be built for all types!")
-             message("instance should be built for all types!")
+            set(add_inst 1)
-             set(add_inst 1)
+        endif()
-     endif()
+        if(NOT DEFINED DTYPES)
-    if("${cmake_instance}" MATCHES "quantization" AND DEFINED DTYPES AND NOT DTYPES MATCHES "int8")
+            set(add_inst 1)
-           message("quantization instances will not be built!")
+        endif()
-           set(add_inst 0)
+        if(("${cmake_instance}" MATCHES "quantization") AND (DEFINED DTYPES) AND (NOT DTYPES MATCHES "int8"))
-    endif()
+            message("quantization instances will not be built!")
-    if("${cmake_instance}" MATCHES "ONLY DL_KERNELS" AND NOT DEFINED DL_KERNELS)
-	    message("Found only dl instances, but DL_KERNELS is not set. Skipping.")
            set(add_inst 0)
-    endif()
+        endif()
-    if(add_inst EQUAL 1)
+        if(("${cmake_instance}" MATCHES "ONLY DL_KERNELS") AND (NOT DEFINED DL_KERNELS))
-      get_filename_component(target_dir ${subdir_path} NAME)
+            message("Found only dl instances, but DL_KERNELS is not set. Skipping.")
-      add_subdirectory(${target_dir})
+            set(add_inst 0)
-      list(APPEND CK_DEVICE_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+        endif()
-    endif()
+        if((add_inst EQUAL 1))
-ENDIF()
+            get_filename_component(target_dir ${subdir_path} NAME)
+            add_subdirectory(${target_dir})
+            list(APPEND CK_DEVICE_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            message("add_instance_directory ${subdir_path}")
+        else()
+            message("skip_instance_directory ${subdir_path}")
+        endif()
+    ENDIF()
 ENDFOREACH()
 add_library(device_operations STATIC ${CK_DEVICE_INSTANCES})
@@ -158,11 +165,11 @@ target_compile_options(device_operations PRIVATE
 # install(TARGETS device_operations LIBRARY DESTINATION lib)
 rocm_install(TARGETS device_operations
-        EXPORT device_operationsTargets)
+    EXPORT device_operationsTargets)
 rocm_install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
 rocm_install(EXPORT device_operationsTargets
-        FILE composable_kerneldevice_operationsTargets.cmake
+    FILE composable_kerneldevice_operationsTargets.cmake
-        NAMESPACE composable_kernel::
+    NAMESPACE composable_kernel::
-        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
 )
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
-add_instance_library(device_grouped_conv3d_bwd_data_instance
+set(GROUPED_CONV3D_BWD_DATA
   xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
   xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
   xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
   xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
   xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
   xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
-   xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_input_f16_comp_bf8_f8_instance.cpp
   wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
   wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
   wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp
@@ -13,5 +12,11 @@ add_instance_library(device_grouped_conv3d_bwd_data_instance
   wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp
   wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp
   wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp
-   wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp
+   wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp)
-)
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
+   list(APPEND GROUPED_CONV3D_BWD_DATA
+      xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_input_f16_comp_bf8_f8_instance.cpp)
+endif()
+add_instance_library(device_grouped_conv3d_bwd_data_instance ${GROUPED_CONV3D_BWD_DATA})
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
-add_instance_library(device_grouped_conv3d_fwd_instance
+set(GROUPED_CONV3D_FWD
   xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
-   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp
-   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp
+   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp)
-)
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
+    list(APPEND GROUPED_CONV3D_FWD
+      xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp)
+endif()
+add_instance_library(device_grouped_conv3d_fwd_instance ${GROUPED_CONV3D_FWD})
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt
 set(GROUPED_GEMM_FIXED_NK_INSTANCES)
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+list(APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_kn_mn_instance.cpp
-  list(APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_kn_mn_instance.cpp)
+                                            device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_nk_mn_instance.cpp
-  list(APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_nk_mn_instance.cpp)
+                                            device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_kn_mn_instance.cpp
-endif()
+                                            device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_nk_mn_instance.cpp
+                                            device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp
-if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
+                                            device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp)
-  list(APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_kn_mn_instance.cpp)
-  list(APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_nk_mn_instance.cpp)
-endif()
-if((DTYPES MATCHES "int8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
-  list(APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp)
-  list(APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp)
-endif()
 add_instance_library(device_grouped_gemm_fixed_nk_instance ${GROUPED_GEMM_FIXED_NK_INSTANCES})
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -25,8 +25,6 @@ set(PROFILER_SOURCES
    profile_batchnorm_fwd.cpp
    profile_batchnorm_bwd.cpp
    profile_batchnorm_infer.cpp
-    profile_contraction_bilinear.cpp
-    profile_contraction_scale.cpp
    profile_grouped_conv_bwd_data.cpp
    profile_conv_tensor_rearrange.cpp
 )
@@ -46,6 +44,11 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
  list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
 endif()
+if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
+  list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
+  list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
+endif()
 set(PROFILER_EXECUTABLE ckProfiler)
 add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
@@ -76,8 +79,6 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_instan
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
@@ -85,9 +86,18 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_d
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
+if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
+endif()
 if(DL_KERNELS)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
 endif()
 if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)

--- a/profiler/src/profile_grouped_conv_bwd_weight.cpp
+++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp
@@ -86,12 +86,8 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
    using F32  = float;
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
-#ifdef CK_ENABLE_FP8
+    using F8   = ck::f8_t;
-    using F8 = ck::f8_t;
+    using BF8  = ck::bf8_t;
-#endif
-#ifdef CK_ENABLE_BF8
-    using BF8 = ck::bf8_t;
-#endif
    using namespace ck::tensor_layout::convolution;
@@ -141,59 +137,59 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
        {
            return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}, F32{}, F32{});
        }
-        else if(data_type == ConvDataType::F16_F16_F16)
+        if(data_type == ConvDataType::F16_F16_F16)
        {
            return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{}, F16{}, F16{});
        }
-        else if(data_type == ConvDataType::BF16_F32_BF16)
+        if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
            return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
        }
    }
-    else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
+    if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, F32{}, F32{});
        }
-        else if(data_type == ConvDataType::F16_F16_F16)
+        if(data_type == ConvDataType::F16_F16_F16)
        {
            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{}, F16{}, F16{});
        }
-        else if(data_type == ConvDataType::BF16_F32_BF16)
+        if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
        }
    }
-    else if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
+    if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
            return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{});
        }
-        else if(data_type == ConvDataType::F16_F16_F16)
+        if(data_type == ConvDataType::F16_F16_F16)
        {
            return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{});
        }
-        else if(data_type == ConvDataType::BF16_F32_BF16)
+        if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
            return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
        }
    }
-    else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
+    if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, F32{}, F32{});
        }
-        else if(data_type == ConvDataType::F16_F16_F16)
+        if(data_type == ConvDataType::F16_F16_F16)
        {
            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{}, F16{}, F16{});
        }
-        else if(data_type == ConvDataType::BF16_F32_BF16)
+        if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
@@ -204,22 +200,22 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
                I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{});
        }
    }
-    else if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
+    if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{});
        }
-        else if(data_type == ConvDataType::F16_F16_F16)
+        if(data_type == ConvDataType::F16_F16_F16)
        {
            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{});
        }
-        else if(data_type == ConvDataType::BF16_F32_BF16)
+        if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
        }
-        else if(data_type == ConvDataType::F16_F16_F16_BF8_F8)
+        if(data_type == ConvDataType::F16_F16_F16_BF8_F8)
        {
            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{});
        }

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -11,40 +11,40 @@ function(add_test_executable TEST_NAME)
    message("adding test ${TEST_NAME}")
    set(result 1)
    if(DEFINED DTYPES)
-    foreach(source IN LISTS ARGN)
+        foreach(source IN LISTS ARGN)
-        set(test 0)
+            set(test 0)
-        foreach(type IN LISTS DTYPES)
+            foreach(type IN LISTS DTYPES)
-            if(type MATCHES "fp16")
+                if(type MATCHES "fp16")
-                set(type1 "_f16")
+                    set(type1 "_f16")
-            elseif(type MATCHES "fp32")
+                elseif(type MATCHES "fp32")
-                set(type1 "_f32")
+                    set(type1 "_f32")
-            elseif(type MATCHES "fp8")
+                elseif(type MATCHES "fp8")
-                set(type1 "_f8")
+                    set(type1 "_f8")
-            elseif(type MATCHES "bf16")
+                elseif(type MATCHES "bf16")
-                set(type1 "_b16")
+                    set(type1 "_b16")
-            elseif(type MATCHES "fp64")
+                elseif(type MATCHES "fp64")
-                set(type1 "_f64")
+                    set(type1 "_f64")
-            elseif(type MATCHES "int8")
+                elseif(type MATCHES "int8")
-                set(type1 "_i8")
+                    set(type1 "_i8")
-            endif()
+                endif()
-            if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}")
+                if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}")
-                #if filename matches any selected type, exit type loop and do no exclude the file from the list
+                    #if filename matches any selected type, exit type loop and do no exclude the file from the list
-                set(test 0)
+                    set(test 0)
-                break()
+                    break()
-            elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
+                elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
-                source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
+                    source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
-                NOT(source MATCHES type OR source MATCHES type1))
+                    NOT(source MATCHES type OR source MATCHES type1))
                    #if filename contains a type which doesn't match any selected type, mark it for removal
                    set(test 1)
+                endif()
+            endforeach()
+            if(test EQUAL 1)
+                message("removing test ${source} ")
+                list(REMOVE_ITEM ARGN "${source}")
            endif()
        endforeach()
-        if(test EQUAL 1)
+    endif()
-            message("removing test ${source} ")
+    foreach(source IN LISTS ARGN)
-            list(REMOVE_ITEM ARGN "${source}")
-        endif()
-      endforeach()
-      endif()
-      foreach(source IN LISTS ARGN)
        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
            message("removing dl test ${source} ")
            list(REMOVE_ITEM ARGN "${source}")
@@ -70,38 +70,38 @@ function(add_gtest_executable TEST_NAME)
    message("adding gtest ${TEST_NAME}")
    set(result 1)
    if(DEFINED DTYPES)
-    foreach(source IN LISTS ARGN)
+        foreach(source IN LISTS ARGN)
-        set(test 0)
+            set(test 0)
-        foreach(type IN LISTS DTYPES)
+            foreach(type IN LISTS DTYPES)
-            if(type MATCHES "fp16")
+                if(type MATCHES "fp16")
-                set(type1 "_f16")
+                    set(type1 "_f16")
-            elseif(type MATCHES "fp32")
+                elseif(type MATCHES "fp32")
-                set(type1 "_f32")
+                    set(type1 "_f32")
-            elseif(type MATCHES "fp8")
+                elseif(type MATCHES "fp8")
-                set(type1 "_f8")
+                    set(type1 "_f8")
-            elseif(type MATCHES "bf16")
+                elseif(type MATCHES "bf16")
-                set(type1 "_b16")
+                    set(type1 "_b16")
-            elseif(type MATCHES "fp64")
+                elseif(type MATCHES "fp64")
-                set(type1 "_f64")
+                    set(type1 "_f64")
-            elseif(type MATCHES "int8")
+                elseif(type MATCHES "int8")
-                set(type1 "_i8")
+                    set(type1 "_i8")
-            endif()
+                endif()
-            if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}")
+                if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}")
-                #if filename matches any selected type, exit type loop and do no exclude the file from the list
+                    #if filename matches any selected type, exit type loop and do no exclude the file from the list
-                set(test 0)
+                    set(test 0)
-                break()
+                    break()
-            elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
+                elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
-                source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
+                    source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
-                NOT(source MATCHES type OR source MATCHES type1))
+                    NOT(source MATCHES type OR source MATCHES type1))
                    #if filename contains a type which doesn't match any selected type, mark it for removal
                    set(test 1)
+                endif()
+            endforeach()
+            if(test EQUAL 1)
+                message("removing gtest ${source} ")
+                list(REMOVE_ITEM ARGN "${source}")
            endif()
        endforeach()
-        if(test EQUAL 1)
-            message("removing gtest ${source} ")
-            list(REMOVE_ITEM ARGN "${source}")
-        endif()
-    endforeach()
    endif()
    foreach(source IN LISTS ARGN)
        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")

--- a/test/contraction/CMakeLists.txt
+++ b/test/contraction/CMakeLists.txt
-add_gtest_executable(test_contraction test_contraction.cpp)
-target_link_libraries(test_contraction PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
+    if(gpu IN_LIST gpu_list AND target EQUAL 0)
-    add_gtest_executable(test_contraction_interface test_contraction_interface.cpp)
+        if((DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64") OR NOT DEFINED DTYPES)
-    target_link_libraries(test_contraction_interface PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
+            add_gtest_executable(test_contraction test_contraction.cpp)
-   set(target 1)
+            target_link_libraries(test_contraction PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
- endif()
+            add_gtest_executable(test_contraction_interface test_contraction_interface.cpp)
+            target_link_libraries(test_contraction_interface PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
+            set(target 1)
+        endif()
+    endif()
 endforeach()
--- a/test/conv_tensor_rearrange/CMakeLists.txt
+++ b/test/conv_tensor_rearrange/CMakeLists.txt
 add_gtest_executable(test_conv_tensor_rearrange test_conv_tensor_rearrange.cpp)
 target_link_libraries(test_conv_tensor_rearrange PRIVATE utility device_image_to_column_instance device_column_to_image_instance)
 add_gtest_executable(test_conv_tensor_rearrange_interface test_conv_tensor_rearrange_interface.cpp)
 target_link_libraries(test_conv_tensor_rearrange_interface PRIVATE utility)