Merge remote-tracking branch 'origin/develop' into migx-flash-attn

4100d1d8 · Alan Turner · 48717006 · c8a8385f · 4100d1d8 · 4100d1d8
Commit 4100d1d8 authored Aug 23, 2023 by Alan Turner
20 changed files
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_FP32
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -117,3 +117,4 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -123,3 +123,4 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
@@ -30,7 +30,12 @@ using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std:
    //###################|<in, mean, square_mean, gamma, beta>| <out>|  functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   8,       Sequence<8, 1, 1, 8, 8>,      Sequence<8>                >,
    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   4,       Sequence<4, 1, 1, 4, 4>,      Sequence<4>                >,
-    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   2,       Sequence<2, 1, 1, 2, 2>,      Sequence<2>                >,
+    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   2,       Sequence<2, 1, 1, 2, 2>,      Sequence<2>                >
+    // clang-format on
+    >;
+
+using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance = std::tuple<
+    // clang-format off
    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   1,       Sequence<1, 1, 1, 1, 1>,      Sequence<1>                >
    // clang-format on
    >;
@@ -39,6 +44,9 @@ void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
    std::vector<DeviceElementwisePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
        instances)
 {
+    add_device_operation_instances(
+        instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance{});
+
    add_device_operation_instances(
        instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances{});
 }

--- a/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 add_instance_library(device_elementwise_normalization_instance
    device_elementwise_normalization_f16_instance.cpp
 )
+endif()
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
-add_instance_library(device_gemm_instance
-   device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp
-   device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp
-   device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp
-   device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp
-   device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp
-   device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp
-   device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp
-   device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp
-   device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
-   device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
-   device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
-   device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp
-   device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
-   device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
-   device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
-   device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
-   device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
-   device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
-   device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
-   device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
-   device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
-   device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
-   device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp
-   device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
-   device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp
-   device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
-   device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
-   device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
-   device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp
-   device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
-   device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp
-   device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
-   device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp
-)
+set(GEMM_INSTANCES)
+if(DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
+    list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp)
+    list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp)
+    list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp)
+    list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp)
+endif()
+if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp)
+   if(DL_KERNELS)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp)
+   endif()
+endif()
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+   if(DL_KERNELS)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_dpp8_f16_f16_f16_km_kn_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_dpp8_f16_f16_f16_km_nk_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_dpp8_f16_f16_f16_mk_kn_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_dpp8_f16_f16_f16_mk_nk_mn_instance.cpp)
+   endif()
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_add_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v2_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_add_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v2_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_add_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v2_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_add_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp)
+endif()
+if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
+   if(DL_KERNELS)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp)
+   endif()
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp)
+endif()
+if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp)
+   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp)
+endif()
+
+add_instance_library(device_gemm_instance ${GEMM_INSTANCES})
+
+
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+  set(ENABLE_PIPELINE_V2_OPT OFF)
+
+  if (ENABLE_PIPELINE_V2_OPT)
+    set(MAX_ILP_OPTS
+       -mllvm
+       -amdgpu-enable-max-ilp-scheduling-strategy
+    )
+    set(WAVES_PER_EU_DEFS
+       CK_USE_WAVES_PER_EU=1
+       CK_MIN_WAVES_PER_EU=1
+       CK_MAX_WAVES_PER_EU=1
+    )
+    set(IGLP_OPT_DEFS
+       CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT=1
+    )
+
+    # layout=NT
+    set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+       COMPILE_OPTIONS ";;"
+       COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+    # layout=NN
+    set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+       COMPILE_OPTIONS "${MAX_ILP_OPTS}"
+       COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+    # layout=TT
+    set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+       COMPILE_OPTIONS ";;"
+       COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+    # layout=TN
+    set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+       COMPILE_OPTIONS "${MAX_ILP_OPTS}"
+       COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+  endif(ENABLE_PIPELINE_V2_OPT)
+endif(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_dpp8_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_dpp8_f16_f16_f16_km_kn_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl_dpp8.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+using device_gemm_dl_dpp8_f16_f16_f16_km_kn_mn_instances = std::tuple<
+    // clang-format off
+        // ##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|      CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|         SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|                Order|                 |                   |
+        // ##########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                     |                 |                   |
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          8,      1,       S<1, 8>,       S<1, 1>,      S<1, 1, 4, 2>,       S<4, 1,  2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 2>,       S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    64,    16,  2,          1,          8,      1,       S<1, 8>,       S<4, 1>,      S<1, 1, 4, 2>,      S<16, 1,  2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 2, 2>,      S<16, 1, 2, 2>,       S<1, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 2, 1>,      S<0, 3, 1, 2>,        S<1, 1, 2, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    64,    16,  2,          1,          8,      1,       S<1, 8>,       S<4, 1>,      S<1, 1, 4, 2>,      S<16, 1,  2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 2, 2>,       S<4, 1, 8, 2>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    16,  2,          1,          8,      1,       S<1, 8>,       S<8, 1>,      S<1, 1, 2, 2>,      S<16, 1,  4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 2, 1>,      S<0, 3, 1, 2>,       S<1, 1, 2, 2>,       S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          8,      1,       S<2, 8>,       S<4, 1>,      S<2, 1, 4, 2>,      S<4, 1,  16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 2>,       S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    16,  2,          8,          1,      1,       S<1, 1>,       S<8, 8>,      S<1, 1, 2, 2>,      S<16, 1,  4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 2, 1>,      S<0, 3, 1, 2>,       S<1, 1, 2, 2>,       S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,    64,    16,  2,          4,          8,      1,       S<2, 8>,       S<8, 1>,      S<2, 1, 4, 2>,      S<8, 1,  16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 2>,       S<2, 1, 4, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          1,          8,      8,       S<4, 8>,       S<4, 1>,      S<4, 1, 4, 2>,      S<4, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 2>,       S<2, 1, 4, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,     8,  2,          1,          8,      8,       S<8, 8>,       S<4, 1>,      S<1, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 2>,       S<1, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          8,      8,       S<2, 8>,      S<16, 1>,      S<2, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 2>,       S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_dpp8_f16_f16_f16_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_gemm_dl_dpp8_f16_f16_f16_km_kn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_dpp8_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_dpp8_f16_f16_f16_km_nk_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl_dpp8.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+using device_gemm_dl_dpp8_f16_f16_f16_km_nk_mn_instances = std::tuple<
+    // clang-format off
+        // ##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|      CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|         SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|                Order|                 |                   |
+        // ##########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                     |                 |                   |
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          8,      1,       S<1, 8>,       S<1, 1>,      S<1, 1, 4, 2>,       S<4, 1,  2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 2>,       S<4, 1, 1, 2>,      S<1, 1,   8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    64,    16,  2,          1,          8,      1,       S<1, 8>,       S<4, 1>,      S<1, 1, 4, 2>,      S<16, 1,  2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 2, 2>,      S<16, 1, 2, 2>,      S<1, 1,  32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    64,    16,  2,          1,          8,      1,       S<1, 8>,       S<4, 1>,      S<1, 1, 4, 2>,      S<16, 1,  2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 2, 2>,       S<4, 1, 8, 2>,      S<4, 1,   8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    16,  2,          1,          8,      1,       S<1, 8>,       S<8, 1>,      S<1, 1, 2, 2>,      S<16, 1,  4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 2, 1>,      S<0, 3, 1, 2>,       S<1, 1, 2, 2>,       S<4, 1, 4, 2>,      S<4, 1,  16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          8,      1,       S<2, 8>,       S<4, 1>,      S<2, 1, 4, 2>,      S<4, 1,  16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 2>,       S<4, 1, 2, 2>,      S<2, 1,  32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    16,  2,          8,          1,      1,       S<1, 1>,       S<8, 8>,      S<1, 1, 2, 2>,      S<16, 1,  4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 2, 1>,      S<0, 3, 1, 2>,       S<1, 1, 2, 2>,       S<4, 1, 4, 2>,      S<4, 1,  16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,    64,    16,  2,          4,          8,      1,       S<2, 8>,       S<8, 1>,      S<2, 1, 4, 2>,      S<8, 1,  16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 2>,       S<4, 1, 2, 2>,      S<4, 1,  32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          1,          8,      8,       S<4, 8>,       S<4, 1>,      S<4, 1, 4, 2>,      S<4, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 2>,       S<4, 1, 2, 2>,      S<4, 1,  32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,     8,  2,          1,          8,      8,       S<8, 8>,       S<4, 1>,      S<1, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 2>,       S<4, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          8,      8,       S<2, 8>,      S<16, 1>,      S<2, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 2>,       S<4, 1, 2, 2>,      S<4, 1,  64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_dpp8_f16_f16_f16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_gemm_dl_dpp8_f16_f16_f16_km_nk_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_dpp8_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_dpp8_f16_f16_f16_mk_kn_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl_dpp8.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+using device_gemm_dl_dpp8_f16_f16_f16_mk_kn_mn_instances = std::tuple<
+    // clang-format off
+        // ##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|      CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|         SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|                Order|                 |                   |
+        // ##########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                     |                 |                   |
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          8,      1,       S<1, 8>,       S<1, 1>,      S<4, 1, 1, 2>,       S<1, 1,  8, 1>,  S<1, 2, 0 ,3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    64,    16,  2,          1,          8,      1,       S<1, 8>,       S<4, 1>,      S<4, 1, 1, 2>,        S<4, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<16, 1, 2, 2>,       S<1, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 2, 1>,      S<0, 3, 1, 2>,        S<1, 1, 2, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    64,    16,  2,          1,          8,      1,       S<1, 8>,       S<4, 1>,      S<4, 1, 1, 2>,        S<4, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<4, 1, 8, 2>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    16,  2,          1,          8,      1,       S<1, 8>,       S<8, 1>,      S<2, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<2, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          8,      1,       S<2, 8>,       S<4, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    16,  2,          8,          1,      1,       S<1, 1>,       S<8, 8>,      S<2, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<2, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,    64,    16,  2,          4,          8,      1,       S<2, 8>,       S<8, 1>,      S<4, 1, 2, 2>,       S<4, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<2, 1, 4, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          1,          8,      8,       S<4, 8>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<2, 1, 4, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,     8,  2,          1,          8,      8,       S<8, 8>,       S<4, 1>,      S<4, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<1, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          8,      8,       S<2, 8>,      S<16, 1>,      S<4, 1, 2, 2>,       S<4, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_dpp8_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_gemm_dl_dpp8_f16_f16_f16_mk_kn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_dpp8_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_dpp8_f16_f16_f16_mk_nk_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl_dpp8.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+using device_gemm_dl_dpp8_f16_f16_f16_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        // ##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|      CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|         SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|                Order|                 |                   |
+        // ##########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                     |                 |                   |
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          8,      1,       S<1, 8>,       S<1, 1>,      S<4, 1, 1, 2>,       S<1, 1,  8, 1>,  S<1, 2, 0 ,3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<4, 1, 1, 2>,      S<1, 1,   8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    64,    16,  2,          1,          8,      1,       S<1, 8>,       S<4, 1>,      S<4, 1, 1, 2>,        S<4, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<16, 1, 2, 2>,      S<1, 1,  32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    64,    16,  2,          1,          8,      1,       S<1, 8>,       S<4, 1>,      S<4, 1, 1, 2>,        S<4, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<4, 1, 8, 2>,      S<4, 1,   8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    16,  2,          1,          8,      1,       S<1, 8>,       S<8, 1>,      S<2, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<2, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<4, 1, 4, 2>,      S<4, 1,  16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          8,      1,       S<2, 8>,       S<4, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<4, 1, 2, 2>,      S<2, 1,  32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    16,  2,          8,          1,      1,       S<1, 1>,       S<8, 8>,      S<2, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<2, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<4, 1, 4, 2>,      S<4, 1,  16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,    64,    16,  2,          4,          8,      1,       S<2, 8>,       S<8, 1>,      S<4, 1, 2, 2>,       S<4, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<4, 1, 2, 2>,      S<4, 1,  32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          1,          8,      8,       S<4, 8>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<4, 1, 2, 2>,      S<4, 1,  32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,     8,  2,          1,          8,      8,       S<8, 8>,       S<4, 1>,      S<4, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<4, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDlDpp8< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          8,      8,       S<2, 8>,      S<16, 1>,      S<4, 1, 2, 2>,       S<4, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,       S<4, 1, 2, 2>,      S<4, 1,  64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,  S<0, 1, 2, 3, 4, 5>,                5,                  4>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_dpp8_f16_f16_f16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_gemm_dl_dpp8_f16_f16_f16_mk_nk_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -80,3 +80,4 @@ void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -78,3 +78,4 @@ void add_device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -80,3 +80,4 @@ void add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -78,3 +78,4 @@ void add_device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -80,3 +80,4 @@ void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -78,3 +78,4 @@ void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -80,3 +80,4 @@ void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -78,3 +78,4 @@ void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -66,3 +66,4 @@ void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -66,3 +66,4 @@ void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -66,3 +66,4 @@ void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif