Merge branch 'develop' of...

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into add_fp16_wmma_conv_instance

Merge branch 'develop' of...
Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into add_fp16_wmma_conv_instance
e57bd240 · aska-0096 · 4b70d68e · 37a8c1f7 · e57bd240 · e57bd240
Commit e57bd240 authored Sep 07, 2023 by aska-0096
20 changed files
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef CK_ENABLE_BF16
 void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(
    std::vector<std::unique_ptr<
        DeviceBatchedGemm<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
@@ -36,7 +36,8 @@ void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(
    std::vector<std::unique_ptr<
        DeviceBatchedGemm<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
        instances);
+#endif
+#ifdef CK_ENABLE_FP16
 void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(
    std::vector<std::unique_ptr<
        DeviceBatchedGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
@@ -56,7 +57,8 @@ void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(
    std::vector<std::unique_ptr<
        DeviceBatchedGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
+#endif
+#ifdef CK_ENABLE_FP32
 void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances(
    std::vector<std::unique_ptr<
        DeviceBatchedGemm<Col, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
@@ -76,7 +78,8 @@ void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances(
    std::vector<std::unique_ptr<
        DeviceBatchedGemm<Row, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
        instances);
+#endif
+#ifdef CK_ENABLE_INT8
 void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemm<Col,
                                                  Row,
@@ -120,7 +123,7 @@ void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances(
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances);
+#endif
 template <typename ALayout,
          typename BLayout,
          typename CLayout,
@@ -151,7 +154,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef CK_ENABLE_FP32
        if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
                     is_same_v<CDataType, float>)
        {
@@ -176,8 +179,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+#endif
-                          is_same_v<CDataType, half_t>)
+#ifdef CK_ENABLE_FP16
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<CDataType, half_t>)
        {
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<CLayout, Row>)
@@ -200,8 +205,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
+#endif
-                          is_same_v<CDataType, bhalf_t>)
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
+                     is_same_v<CDataType, bhalf_t>)
        {
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<CLayout, Row>)
@@ -224,8 +231,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
+#endif
-                          is_same_v<CDataType, int8_t>)
+#ifdef CK_ENABLE_INT8
+        if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
+                     is_same_v<CDataType, int8_t>)
        {
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<CLayout, Row>)
@@ -248,7 +257,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances(op_ptrs);
            }
        }
+#endif
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
@@ -14,7 +14,7 @@
 using CDE0ElementOp = ck::tensor_operation::element_wise::AddRelu;
 using CDE1ElementOp = ck::tensor_operation::element_wise::Add;
+#ifdef CK_ENABLE_FP16
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -137,3 +137,4 @@ struct DeviceOperationInstanceFactory<
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp
@@ -13,7 +13,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#ifdef CK_ENABLE_FP16
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -91,3 +91,4 @@ struct DeviceOperationInstanceFactory<
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef CK_ENABLE_FP16
 void add_device_batched_gemm_bias_masking_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances(
    std::vector<std::unique_ptr<
        DeviceBatchedGemmSoftmaxGemmPermute<2,
@@ -58,7 +58,8 @@ void add_device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_
                                                            PassThrough,
                                                            MaskingSpecialization::MaskDisabled>>>&
        instances);
+#endif
+#ifdef CK_ENABLE_BF16
 void add_device_batched_gemm_bias_masking_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances(
    std::vector<std::unique_ptr<
        DeviceBatchedGemmSoftmaxGemmPermute<2,
@@ -100,7 +101,7 @@ void add_device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf
                                                            PassThrough,
                                                            MaskingSpecialization::MaskDisabled>>>&
        instances);
+#endif
 template <typename ADataType,
          typename B0DataType,
          typename B1DataType,
@@ -147,7 +148,7 @@ struct DeviceOperationInstanceFactory<
    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef CK_ENABLE_FP16
        if constexpr(is_same_v<ADataType, half_t> && is_same_v<B0DataType, half_t> &&
                     is_same_v<B1DataType, half_t> && is_same_v<CDataType, half_t> &&
                     Acc0BiasDataType::Size() == 1 &&
@@ -164,6 +165,8 @@ struct DeviceOperationInstanceFactory<
                    op_ptrs);
            }
        }
+#endif
+#ifdef CK_ENABLE_BF16
        else if constexpr(is_same_v<ADataType, BF16> && is_same_v<B0DataType, BF16> &&
                          is_same_v<B1DataType, BF16> && is_same_v<CDataType, BF16> &&
                          Acc0BiasDataType::Size() == 1 &&
@@ -180,6 +183,7 @@ struct DeviceOperationInstanceFactory<
                    op_ptrs);
            }
        }
+#endif
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef CK_ENABLE_FP16
 void add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
    std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
                                                      Col,
@@ -111,3 +111,4 @@ struct DeviceOperationInstanceFactory<
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp
@@ -14,11 +14,12 @@
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#ifdef DL_KERNELS
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef CK_ENABLE_FP16
 void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
                                                        Row,
@@ -122,7 +123,8 @@ void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instan
                                                        PassThrough,
                                                        PassThrough,
                                                        PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_INT8
 void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
                                                        Row,
@@ -226,7 +228,7 @@ void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances
                                                        PassThrough,
                                                        PassThrough,
                                                        PassThrough>>>& instances);
+#endif
 template <typename ALayout,
          typename BLayout,
          typename ELayout,
@@ -261,7 +263,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef CK_ENABLE_FP16
        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                     is_same_v<EDataType, half_t>)
        {
@@ -294,6 +296,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                    op_ptrs);
            }
        }
+#endif
+#ifdef CK_ENABLE_INT8
        else if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
                          is_same_v<EDataType, int8_t>)
        {
@@ -326,7 +330,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                    op_ptrs);
            }
        }
+#endif
        return op_ptrs;
    }
 };
@@ -335,3 +339,4 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
@@ -11,7 +11,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#ifdef CK_ENABLE_FP16
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -119,3 +119,4 @@ struct DeviceOperationInstanceFactory<
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef CK_ENABLE_FP16
 void add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances(
    std::vector<std::unique_ptr<
        DeviceBatchedGemmSoftmaxGemmPermute<2,
@@ -58,7 +58,8 @@ void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_g
                                                            PassThrough,
                                                            MaskingSpecialization::MaskDisabled>>>&
        instances);
+#endif
+#ifdef CK_ENABLE_BF16
 void add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances(
    std::vector<std::unique_ptr<
        DeviceBatchedGemmSoftmaxGemmPermute<2,
@@ -100,6 +101,7 @@ void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf
                                                            PassThrough,
                                                            MaskingSpecialization::MaskDisabled>>>&
        instances);
+#endif
 template <typename ADataType,
          typename B0DataType,
@@ -146,7 +148,7 @@ struct DeviceOperationInstanceFactory<
    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef CK_ENABLE_FP16
        if constexpr(is_same_v<ADataType, half_t> && is_same_v<B0DataType, half_t> &&
                     is_same_v<B1DataType, half_t> && is_same_v<CDataType, half_t>)
        {
@@ -161,6 +163,8 @@ struct DeviceOperationInstanceFactory<
                    op_ptrs);
            }
        }
+#endif
+#ifdef CK_ENABLE_BF16
        else if constexpr(is_same_v<ADataType, BF16> && is_same_v<B0DataType, BF16> &&
                          is_same_v<B1DataType, BF16> && is_same_v<CDataType, BF16>)
        {
@@ -175,6 +179,7 @@ struct DeviceOperationInstanceFactory<
                    op_ptrs);
            }
        }
+#endif
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef CK_ENABLE_FP32
 // float
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -65,7 +65,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn
                                                           PassThrough,
                                                           PassThrough,
                                                           Bilinear>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP64
 // double
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -114,7 +115,7 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn
                                                           PassThrough,
                                                           PassThrough,
                                                           Bilinear>>>& instances);
+#endif
 // Contraction + Bilinear
 template <index_t NumDimM,
          index_t NumDimN,
@@ -149,7 +150,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef CK_ENABLE_FP32
        if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
                     is_same_v<DDataType, float> && is_same_v<EDataType, float>)
        {
@@ -165,7 +166,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                    op_ptrs);
            }
        }
+#endif
+#ifdef CK_ENABLE_FP64
        if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
                     is_same_v<DDataType, double> && is_same_v<EDataType, double>)
        {
@@ -181,7 +183,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                    op_ptrs);
            }
        }
+#endif
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef CK_ENABLE_FP32
 // float
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -65,7 +65,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instanc
                                                           PassThrough,
                                                           PassThrough,
                                                           Scale>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP64
 // double
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -114,7 +115,7 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instanc
                                                           PassThrough,
                                                           PassThrough,
                                                           Scale>>>& instances);
+#endif
 // Contraction + Scale
 template <index_t NumDimM,
          index_t NumDimN,
@@ -148,7 +149,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef CK_ENABLE_FP32
        if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
                     is_same_v<EDataType, float>)
        {
@@ -164,7 +165,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                    op_ptrs);
            }
        }
+#endif
+#ifdef CK_ENABLE_FP64
        if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
                     is_same_v<EDataType, double>)
        {
@@ -180,7 +182,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                    op_ptrs);
            }
        }
+#endif
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef CK_ENABLE_BF16
 // conv1d backward data
 void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<1,
@@ -29,17 +29,20 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP16
 void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(
    std::vector<std::unique_ptr<
        DeviceConvBwdData<1, NWC, KXC, NWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
+#endif
+#ifdef CK_ENABLE_FP32
 void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(
    std::vector<std::unique_ptr<
        DeviceConvBwdData<1, NWC, KXC, NWK, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
        instances);
-#ifdef __int8__
+#endif
+#ifdef CK_ENABLE_INT8
 void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<1,
                                                  NWC,
@@ -52,6 +55,7 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(
                                                  PassThrough,
                                                  PassThrough>>>& instances);
 #endif
+#ifdef CK_ENABLE_BF16
 // conv2d backward data
 void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<2,
@@ -64,7 +68,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP16
 void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<2,
                                                  NHWC,
@@ -76,7 +81,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP32
 void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<2,
                                                  NHWC,
@@ -88,7 +94,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances);
-#ifdef __int8__
+#endif
+#ifdef CK_ENABLE_INT8
 void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<2,
                                                  NHWC,
@@ -101,6 +108,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
                                                  PassThrough,
                                                  PassThrough>>>& instances);
 #endif
+#ifdef DL_KERNELS
+#ifdef CK_ENABLE_FP16
 // conv2d dl
 void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<2,
@@ -113,7 +122,8 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances(
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP32
 void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<2,
                                                  NHWC,
@@ -125,7 +135,8 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances);
-#ifdef __int8__
+#endif
+#ifdef CK_ENABLE_INT8
 void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<2,
                                                  NHWC,
@@ -138,6 +149,8 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances(
                                                  PassThrough,
                                                  PassThrough>>>& instances);
 #endif
+#endif
+#ifdef CK_ENABLE_BF16
 // conv3d backward data
 void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<3,
@@ -150,7 +163,8 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP16
 void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<3,
                                                  NDHWC,
@@ -162,7 +176,8 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP32
 void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<3,
                                                  NDHWC,
@@ -174,7 +189,8 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances);
-#ifdef __int8__
+#endif
+#ifdef CK_ENABLE_INT8
 void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(
    std::vector<std::unique_ptr<DeviceConvBwdData<3,
                                                  NDHWC,
@@ -229,20 +245,23 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
            {
                add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(op_ptrs);
            }
-            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+#ifdef CK_ENABLE_FP16
-                              is_same_v<OutDataType, half_t>)
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t>)
            {
                add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(op_ptrs);
            }
-            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+#endif
-                              is_same_v<WeiDataType, ck::bhalf_t> &&
+#ifdef CK_ENABLE_BF16
-                              is_same_v<OutDataType, ck::bhalf_t>)
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
            {
                add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(op_ptrs);
            }
-#ifdef __int8__
+#endif
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+#ifdef CK_ENABLE_INT8
-                              is_same_v<OutDataType, int8_t>)
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<OutDataType, int8_t>)
            {
                add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(op_ptrs);
            }
@@ -255,26 +274,35 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
                         is_same_v<OutDataType, float>)
            {
                add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
+#endif
            }
-            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+#ifdef CK_ENABLE_FP16
-                              is_same_v<OutDataType, half_t>)
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t>)
            {
                add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
+#endif
            }
-            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+#endif
-                              is_same_v<WeiDataType, ck::bhalf_t> &&
+#ifdef CK_ENABLE_BF16
-                              is_same_v<OutDataType, ck::bhalf_t>)
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
            {
                add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(op_ptrs);
            }
-#ifdef __int8__
+#endif
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+#ifdef CK_ENABLE_INT8
-                              is_same_v<OutDataType, int8_t>)
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<OutDataType, int8_t>)
            {
                add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances(op_ptrs);
+#endif
            }
 #endif
        }
@@ -286,20 +314,23 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
            {
                add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(op_ptrs);
            }
-            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+#ifdef CK_ENABLE_FP16
-                              is_same_v<OutDataType, half_t>)
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t>)
            {
                add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(op_ptrs);
            }
-            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+#endif
-                              is_same_v<WeiDataType, ck::bhalf_t> &&
+#ifdef CK_ENABLE_BF16
-                              is_same_v<OutDataType, ck::bhalf_t>)
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
            {
                add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(op_ptrs);
            }
-#ifdef __int8__
+#endif
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+#ifdef CK_ENABLE_INT8
-                              is_same_v<OutDataType, int8_t>)
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<OutDataType, int8_t>)
            {
                add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(op_ptrs);
            }

--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
@@ -18,11 +18,17 @@ namespace device {
 namespace instance {
 // conv2d forward
+#ifdef CK_ENABLE_FP16
 void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
    std::vector<std::unique_ptr<
        DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<std::unique_ptr<
+        DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+#ifdef CK_ENABLE_BF16
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvFwd<2,
                                              NHWC,
@@ -34,17 +40,14 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
                                              PassThrough,
                                              PassThrough,
                                              PassThrough>>>& instances);
+#endif
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
+#ifdef CK_ENABLE_FP32
-    std::vector<std::unique_ptr<
-        DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
    std::vector<std::unique_ptr<
        DeviceConvFwd<2, NHWC, KYXC, NHWK, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
        instances);
+#endif
+#ifdef CK_ENABLE_INT8
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
    std::vector<std::unique_ptr<DeviceConvFwd<2,
                                              NHWC,
@@ -56,6 +59,7 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
                                              PassThrough,
                                              PassThrough,
                                              PassThrough>>>& instances);
+#endif
 template <ck::index_t NumDimSpatial,
          typename InLayout,
@@ -99,23 +103,29 @@ struct DeviceOperationInstanceFactory<
            {
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
            }
+#ifdef CK_ENABLE_FP16
            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                              is_same_v<OutDataType, half_t>)
            {
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
                add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
            }
+#endif
+#ifdef CK_ENABLE_BF16
            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                              is_same_v<WeiDataType, ck::bhalf_t> &&
                              is_same_v<OutDataType, ck::bhalf_t>)
            {
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(op_ptrs);
            }
+#endif
+#ifdef CK_ENABLE_INT8
            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                              is_same_v<OutDataType, int8_t>)
            {
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(op_ptrs);
            }
+#endif
        }
        return op_ptrs;

--- a/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp
@@ -11,7 +11,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#ifdef CK_ENABLE_FP16
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -77,3 +77,4 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceElemen
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
@@ -17,11 +17,17 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
+#if defined(CK_ENABLE_FP16) && defined(DL_KERNELS)
 void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
+void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
 void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
@@ -32,6 +38,11 @@ void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(
        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
+void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
 void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
@@ -42,6 +53,11 @@ void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(
        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
+void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
 void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
@@ -52,15 +68,20 @@ void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(
        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(
+void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
+void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+#if defined(CK_ENABLE_FP32) && defined(DL_KERNELS)
 void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
        instances);
 void add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(
@@ -77,7 +98,8 @@ void add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
        instances);
-#ifdef __int8__
+#endif
+#if defined(CK_ENABLE_INT8) && defined(DL_KERNELS)
 void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
@@ -117,7 +139,8 @@ void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
        instances);
+#endif
+#ifdef CK_ENABLE_INT8
 void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
@@ -138,90 +161,93 @@ void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
        DeviceGemm<Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
        instances);
 #endif
+#ifdef CK_ENABLE_FP16
 void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances(
+void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
    std::vector<std::unique_ptr<
-        DeviceGemm<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances(
+void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
    std::vector<std::unique_ptr<
-        DeviceGemm<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances(
+void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
    std::vector<std::unique_ptr<
-        DeviceGemm<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(
+void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
-        DeviceGemm<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
+void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
+void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
+void add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
+void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(
+#endif
+#ifdef CK_ENABLE_BF16
+void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances(
    std::vector<std::unique_ptr<
-        DeviceGemm<Col, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
+        DeviceGemm<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(
+void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances(
    std::vector<std::unique_ptr<
-        DeviceGemm<Col, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
+        DeviceGemm<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(
+void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances(
    std::vector<std::unique_ptr<
-        DeviceGemm<Row, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
+        DeviceGemm<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(
+void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
-        DeviceGemm<Row, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
+        DeviceGemm<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
        instances);
+#endif
-void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(
+#ifdef CK_ENABLE_FP32
+void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(
    std::vector<std::unique_ptr<
-        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        DeviceGemm<Col, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
+void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(
    std::vector<std::unique_ptr<
-        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        DeviceGemm<Col, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(
+void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(
    std::vector<std::unique_ptr<
-        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        DeviceGemm<Row, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
        instances);
-void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
+void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
-        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        DeviceGemm<Row, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
        instances);
 void add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(
@@ -243,7 +269,8 @@ void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
        instances);
+#endif
+#ifdef CK_ENABLE_FP64
 void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(
    std::vector<std::unique_ptr<
@@ -264,7 +291,7 @@ void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Col, Row, F64, F64, F64, PassThrough, PassThrough, PassThrough>>>&
        instances);
+#endif
 template <typename ALayout,
          typename BLayout,
          typename CLayout,
@@ -303,31 +330,40 @@ struct DeviceOperationInstanceFactory<
                         is_same_v<CLayout, Row>)
            {
                add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(op_ptrs);
+#endif
                add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<CLayout, Row>)
            {
                add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(op_ptrs);
+#endif
                add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
                              is_same_v<CLayout, Row>)
            {
                add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(op_ptrs);
+#endif
                add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
                              is_same_v<CLayout, Row>)
            {
                add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(op_ptrs);
+#endif
                add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(op_ptrs);
            }
        }
+#ifdef CK_ENABLE_FP16
        else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                          is_same_v<CDataType, half_t>)
        {
@@ -335,16 +371,22 @@ struct DeviceOperationInstanceFactory<
                         is_same_v<CLayout, Row>)
            {
                add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
                add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+#endif
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<CLayout, Row>)
            {
                add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+#endif
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
            }
@@ -352,19 +394,27 @@ struct DeviceOperationInstanceFactory<
                              is_same_v<CLayout, Row>)
            {
                add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
                add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+#endif
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
                              is_same_v<CLayout, Row>)
            {
                add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
                add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+#endif
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs);
            }
        }
+#endif
+#ifdef CK_ENABLE_BF16
        else if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
                          is_same_v<CDataType, ck::bhalf_t>)
        {
@@ -389,7 +439,8 @@ struct DeviceOperationInstanceFactory<
                add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances(op_ptrs);
            }
        }
-#ifdef __int8__
+#endif
+#ifdef CK_ENABLE_INT8
        else if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
                          is_same_v<CDataType, int8_t>)
        {
@@ -397,29 +448,37 @@ struct DeviceOperationInstanceFactory<
                         is_same_v<CLayout, Row>)
            {
                add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(op_ptrs);
                add_device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances(op_ptrs);
+#endif
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<CLayout, Row>)
            {
                add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(op_ptrs);
                add_device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances(op_ptrs);
+#endif
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
                              is_same_v<CLayout, Row>)
            {
                add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(op_ptrs);
                add_device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances(op_ptrs);
+#endif
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
                              is_same_v<CLayout, Row>)
            {
                add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs);
+#ifdef DL_KERNELS
                add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(op_ptrs);
                add_device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances(op_ptrs);
+#endif
            }
        }
 #endif

--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp
@@ -9,7 +9,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#ifdef CK_ENABLE_FP16
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -170,3 +170,4 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
@@ -11,7 +11,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#ifdef CK_ENABLE_FP16
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -220,3 +220,4 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp
@@ -10,7 +10,7 @@
 #include "ck/ck.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#ifdef CK_ENABLE_FP16
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -136,3 +136,4 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <cstdlib>
+#include <vector>
+#include <memory>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
+                                                    Row,
+                                                    Row_Row_Tuple,
+                                                    Row,
+                                                    F16,
+                                                    F16,
+                                                    F16_F16_Tuple,
+                                                    F16,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    MultiplyAdd>>>&);
+void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
+                                                    Col,
+                                                    Row_Row_Tuple,
+                                                    Row,
+                                                    F16,
+                                                    F16,
+                                                    F16_F16_Tuple,
+                                                    F16,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    MultiplyAdd>>>&);
+void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
+                                                    Row,
+                                                    Row_Row_Tuple,
+                                                    Row,
+                                                    F16,
+                                                    F8,
+                                                    F32_F32_Tuple,
+                                                    F16,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    MultiplyAdd>>>&);
+void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
+                                                    Col,
+                                                    Row_Row_Tuple,
+                                                    Row,
+                                                    F16,
+                                                    F8,
+                                                    F32_F32_Tuple,
+                                                    F16,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    MultiplyAdd>>>&);
+// GEMM + Multiply + Add
+template <typename ALayout,
+          typename BLayout,
+          typename D0Layout,
+          typename D1Layout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename D0DataType,
+          typename D1DataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleD<
+    ALayout,
+    BLayout,
+    ck::Tuple<D0Layout, D1Layout>,
+    ELayout,
+    ADataType,
+    BDataType,
+    ck::Tuple<D0DataType, D1DataType>,
+    EDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::MultiplyAdd>>
+{
+    using DeviceOp = DeviceGemmMultipleD<ALayout,
+                                         BLayout,
+                                         ck::Tuple<D0Layout, D1Layout>,
+                                         ELayout,
+                                         ADataType,
+                                         BDataType,
+                                         ck::Tuple<D0DataType, D1DataType>,
+                                         EDataType,
+                                         ck::tensor_operation::element_wise::PassThrough,
+                                         ck::tensor_operation::element_wise::PassThrough,
+                                         ck::tensor_operation::element_wise::MultiplyAdd>;
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<D0DataType, half_t> && is_same_v<D1DataType, half_t> &&
+                     is_same_v<EDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
+                     is_same_v<D0DataType, float> && is_same_v<D1DataType, float> &&
+                     is_same_v<EDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+        return op_ptrs;
+    }
+};
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
@@ -57,6 +57,46 @@ void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(
        DeviceGemmSplitK<Row, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
        instances);
+void add_device_gemm_xdl_splitk_f8_f16_f16_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmSplitK<Col, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_splitk_f8_f16_f16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmSplitK<Col, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmSplitK<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_splitk_f8_f16_f16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmSplitK<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_splitk_f16_f8_f16_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmSplitK<Col, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_splitk_f16_f8_f16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmSplitK<Col, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmSplitK<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_splitk_f16_f8_f16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmSplitK<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
 template <typename ADataType,
          typename BDataType,
          typename CDataType,
@@ -136,6 +176,54 @@ struct DeviceOperationInstanceFactory<
                add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(op_ptrs);
            }
        }
+        else if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, half_t> &&
+                          is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f8_f16_f16_mk_nk_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f8_f16_f16_km_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f8_f16_f16_km_nk_mn_instances(op_ptrs);
+            }
+        }
+        else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
+                          is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f16_f8_f16_mk_nk_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f16_f8_f16_km_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f16_f8_f16_km_nk_mn_instances(op_ptrs);
+            }
+        }
        return op_ptrs;
    }

--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_streamk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+#ifdef CK_ENABLE_FP16
+void add_device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmStreamK<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmStreamK<
+    ALayout,
+    BLayout,
+    CLayout,
+    ADataType,
+    BDataType,
+    CDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough>>
+{
+    using DeviceOp = DeviceGemmStreamK<ALayout,
+                                       BLayout,
+                                       CLayout,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       ck::tensor_operation::element_wise::PassThrough,
+                                       ck::tensor_operation::element_wise::PassThrough,
+                                       ck::tensor_operation::element_wise::PassThrough>;
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#if 0
+        if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
+                     is_same_v<CDataType, float>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(op_ptrs);
+            }
+        }
+        else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                          is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+            }
+        }
+#endif
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+            }
+        }
+        return op_ptrs;
+    }
+};
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+#endif