Merge branch 'develop' into amd-develop

705d5a08 · Jun Liu · d4ad52d6 · 8f84a012 · 705d5a08 · 705d5a08
Commit 705d5a08 authored Sep 08, 2023 by Jun Liu
20 changed files
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_dl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGK,
+                                                              F16,
+                                                              F16,
+                                                              Empty_Tuple,
+                                                              F16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f16_instances<NHWGC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              NHWGK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f16_instances<NHWGC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              NHWGK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f16_instances<NHWGC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              NHWGK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_dl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGK,
+                                                              F32,
+                                                              F32,
+                                                              Empty_Tuple,
+                                                              F32,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f32_instances<NHWGC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              NHWGK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f32_instances<NHWGC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              NHWGK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f32_instances<NHWGC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              NHWGK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv2d_fwd_wmma_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[g, n, hi ,wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
+void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              GNHWK,
+                                                              F16,
+                                                              F16,
+                                                              Empty_Tuple,
+                                                              F16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_wmma_f16_instances<GNHWC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                GNHWK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_wmma_f16_instances<GNHWC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                GNHWK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_wmma_f16_instances<GNHWC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                GNHWK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_wmma_f16_instances<GNHWC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                GNHWK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwdOddC>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv2d_fwd_wmma_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[g, n, hi ,wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
+void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              GNHWK,
+                                                              int8_t,
+                                                              int8_t,
+                                                              Empty_Tuple,
+                                                              int8_t,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_wmma_i8_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_wmma_i8_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_wmma_i8_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_wmma_i8_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdOddC>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
@@ -83,7 +83,6 @@ using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instanc
        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v2>,
        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v2>
    // clang-format on
-    // clang-format on
    >;

 void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(

--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/CMakeLists.txt
+add_instance_library(device_grouped_gemm_bias_instance
+   device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_instance.cpp
+   device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_instance.cpp
+
+   device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_instance.cpp
+   device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_instance.cpp
+)
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using D0DataType = F32;
+using DsDataType = ck::Tuple<D0DataType>;
+
+using D0Layout = Row;
+using DsLayout = ck::Tuple<D0Layout>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Add         = ck::tensor_operation::element_wise::Add;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_irregular_tile_instances =
+    std::tuple<
+        // clang-format off
+        //############################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //############################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //############################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //############################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2, S< 1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2, S< 1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   2,   32,   32,    2,    1, S< 1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 16,16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1, S< 1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   2,   32,   32,    1,    2, S< 1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2, S< 1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   2,   32,   32,    2,    2, S< 1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2, S< 1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   2,   32,   32,    2,    2, S< 1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2, S< 1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>
+        // clang-format on
+        >;
+
+void add_device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemmFixedNK<Row,
+                                                         Row,
+                                                         DsLayout,
+                                                         Row,
+                                                         F16,
+                                                         F16,
+                                                         DsDataType,
+                                                         F16,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         Add>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_irregular_tile_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using D0DataType = F32;
+using DsDataType = ck::Tuple<D0DataType>;
+
+using D0Layout = Row;
+using DsLayout = ck::Tuple<D0Layout>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Add         = ck::tensor_operation::element_wise::Add;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_irregular_tile_instances =
+    std::tuple<
+        // clang-format off
+        //############################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //############################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //############################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //############################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   128,   256,    64,   8,   8,   32,   32,    2,    4,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   128,   128,    64,   8,   8,   32,   32,    2,    2,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   128,    64,    64,   8,   8,   32,   32,    2,    1,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,    64,   128,    64,   8,   8,   32,   32,    1,    2,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,   128,   128,    64,   8,   8,   32,   32,    4,    2,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,   128,    64,    64,   8,   8,   32,   32,    2,    2,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,    64,   128,    64,   8,   8,   32,   32,    2,    2,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,   128,    32,    64,   8,   8,   32,   32,    2,    1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,    32,   128,    64,   8,   8,   32,   32,    1,    2,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,    32,   256,    64,   8,   8,   32,   32,    1,    4,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,    64,    64,    64,    64,   8,   8,   32,   32,    2,    2,  S<1, 8,  8, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8,  8, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,    64,    64,    32,    64,   8,   8,   32,   32,    2,    1,  S<1, 8,  8, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8,  8, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F16, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,    64,    32,    64,    64,   8,   8,   32,   32,    1,    2,  S<1, 8,  8, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8,  8, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>
+        // clang-format on
+        >;
+
+void add_device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemmFixedNK<Row,
+                                                         Col,
+                                                         DsLayout,
+                                                         Row,
+                                                         F16,
+                                                         F16,
+                                                         DsDataType,
+                                                         F16,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         Add>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_irregular_tile_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using D0DataType = F32;
+using DsDataType = ck::Tuple<D0DataType>;
+
+using D0Layout = Row;
+using DsLayout = ck::Tuple<D0Layout>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Add         = ck::tensor_operation::element_wise::Add;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_irregular_tile_instances =
+    std::tuple<
+        // clang-format off
+        //############################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //############################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //############################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //############################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2, S< 1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2, S< 1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   2,   32,   32,    2,    1, S< 1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 16,16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1, S< 1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   2,   32,   32,    1,    2, S< 1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2, S< 1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,    32,   128,    32,   8,   8,   32,   32,    1,    1, S< 1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   2,   32,   32,    2,    2, S< 1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 4>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2, S< 1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   2,   32,   32,    2,    2, S< 1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2, S< 1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2, S< 1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,    32,   256,    32,   8,   8,   32,   32,    1,    2, S< 1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Row,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,    16,   128,    32,   8,   8,   16,   16,    1,    4, S< 1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              4>
+        // clang-format on
+        >;
+
+void add_device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemmFixedNK<Row,
+                                                         Row,
+                                                         DsLayout,
+                                                         Row,
+                                                         F16,
+                                                         F16,
+                                                         DsDataType,
+                                                         F32,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         Add>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_irregular_tile_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using D0DataType = F32;
+using DsDataType = ck::Tuple<D0DataType>;
+
+using D0Layout = Row;
+using DsLayout = ck::Tuple<D0Layout>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Add         = ck::tensor_operation::element_wise::Add;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_irregular_tile_instances =
+    std::tuple<
+        // clang-format off
+        //############################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //############################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //############################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //############################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   128,   256,    64,   8,   8,   32,   32,    2,    4,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   128,   128,    64,   8,   8,   32,   32,    2,    2,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,   128,    64,    64,   8,   8,   32,   32,    2,    1,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,    64,   128,    64,   8,   8,   32,   32,    1,    2,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   256,    32,   128,    64,   8,   8,   32,   32,    1,    1,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,   128,   128,    64,   8,   8,   32,   32,    4,    2,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,   128,    64,    64,   8,   8,   32,   32,    2,    2,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,    64,   128,    64,   8,   8,   32,   32,    2,    2,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,   128,    32,    64,   8,   8,   32,   32,    2,    1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,    32,   128,    64,   8,   8,   32,   32,    1,    2,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,   128,    32,   256,    64,   8,   8,   32,   32,    1,    4,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,    64,    64,    64,    64,   8,   8,   32,   32,    2,    2,  S<1, 8,  8, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8,  8, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,    64,    64,    32,    64,   8,   8,   32,   32,    2,    1,  S<1, 8,  8, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8,  8, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              4>,
+        DeviceGroupedGemm_Xdl_Fixed_NK<    Row,    Col,    DsLayout,    Row,   F16,   F16,     F32,      F32,  DsDataType,   F32, PassThrough, PassThrough,         Add, GemmMNKPadding,        1,    64,    32,    64,    64,   8,   8,   32,   32,    1,    2,  S<1, 8,  8, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8,  8, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              4>
+        // clang-format on
+        >;
+
+void add_device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemmFixedNK<Row,
+                                                         Col,
+                                                         DsLayout,
+                                                         Row,
+                                                         F16,
+                                                         F16,
+                                                         DsDataType,
+                                                         F32,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         Add>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_irregular_tile_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt
+add_instance_library(device_image_to_column_instance
+   device_image_to_column_nhwc_1d_instance.cpp
+   device_image_to_column_nhwc_2d_instance.cpp
+   device_image_to_column_nhwc_3d_instance.cpp
+)
--- a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_image_to_column_nhwc_1d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, BF16, BF16>>>& instances)
+{
+    add_device_operation_instances(instances, device_image_to_column_bf16_instances<1, GNWC>{});
+}
+
+void add_device_image_to_column_nhwc_1d_f16_instances(
+    std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, F16, F16>>>& instances)
+{
+    add_device_operation_instances(instances, device_image_to_column_f16_instances<1, GNWC>{});
+}
+
+void add_device_image_to_column_nhwc_1d_f32_instances(
+    std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, F32, F32>>>& instances)
+{
+    add_device_operation_instances(instances, device_image_to_column_f32_instances<1, GNWC>{});
+}
+
+void add_device_image_to_column_nhwc_1d_i8_instances(
+    std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, int8_t, int8_t>>>& instances)
+{
+    add_device_operation_instances(instances, device_image_to_column_i8_instances<1, GNWC>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_image_to_column_nhwc_2d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, BF16, BF16>>>& instances)
+{
+    add_device_operation_instances(instances, device_image_to_column_bf16_instances<2, GNHWC>{});
+}
+
+void add_device_image_to_column_nhwc_2d_f16_instances(
+    std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, F16, F16>>>& instances)
+{
+    add_device_operation_instances(instances, device_image_to_column_f16_instances<2, GNHWC>{});
+}
+
+void add_device_image_to_column_nhwc_2d_f32_instances(
+    std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, F32, F32>>>& instances)
+{
+    add_device_operation_instances(instances, device_image_to_column_f32_instances<2, GNHWC>{});
+}
+
+void add_device_image_to_column_nhwc_2d_i8_instances(
+    std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, int8_t, int8_t>>>& instances)
+{
+    add_device_operation_instances(instances, device_image_to_column_i8_instances<2, GNHWC>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_image_to_column_nhwc_3d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, BF16, BF16>>>& instances)
+{
+    add_device_operation_instances(instances, device_image_to_column_bf16_instances<3, GNDHWC>{});
+}
+
+void add_device_image_to_column_nhwc_3d_f16_instances(
+    std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, F16, F16>>>& instances)
+{
+    add_device_operation_instances(instances, device_image_to_column_f16_instances<3, GNDHWC>{});
+}
+
+void add_device_image_to_column_nhwc_3d_f32_instances(
+    std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, F32, F32>>>& instances)
+{
+    add_device_operation_instances(instances, device_image_to_column_f32_instances<3, GNDHWC>{});
+}
+
+void add_device_image_to_column_nhwc_3d_i8_instances(
+    std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, int8_t, int8_t>>>& instances)
+{
+    add_device_operation_instances(instances, device_image_to_column_i8_instances<3, GNDHWC>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt
+set(DEVICE_MAXPOOL_BWD_INSTANCES)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_MAXPOOL_BWD_INSTANCES device_max_pool_bwd_f16_instance.cpp)
+endif()
+if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_MAXPOOL_BWD_INSTANCES device_max_pool_bwd_bf16_instance.cpp)
+endif()
+if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_MAXPOOL_BWD_INSTANCES device_max_pool_bwd_f32_instance.cpp)
+endif()
+add_instance_library(device_max_pool_bwd_instance ${DEVICE_MAXPOOL_BWD_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_bf16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "max_pool_bwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_maxpool_bwd_bf16_instances(
+    std::vector<std::unique_ptr<DeviceMaxPoolBwd<BF16, I32, BF16>>>& instances)
+{
+    add_device_operation_instances(instances, device_maxpool_bwd_instances<BF16, I32, BF16>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/gemm_dl_algorithm.hpp
+++ b/include/ck/tensor_operation/gpu/device/gemm_dl_algorithm.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-
-enum struct GemmDlAlgorithm
-{
-    Default, // Uses DOT vector instructions
-    Dpp8,    // Uses DOT vector instructions with DPP8 SEL modifier to reduce data loads from LDS
-};
-
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "max_pool_bwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_maxpool_bwd_f16_instances(
+    std::vector<std::unique_ptr<DeviceMaxPoolBwd<F16, I32, F16>>>& instances)
+{
+    add_device_operation_instances(instances, device_maxpool_bwd_instances<F16, I32, F16>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "max_pool_bwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_maxpool_bwd_f32_instances(
+    std::vector<std::unique_ptr<DeviceMaxPoolBwd<F32, I32, F32>>>& instances)
+{
+    add_device_operation_instances(instances, device_maxpool_bwd_instances<F32, I32, F32>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/max_pool_bwd/max_pool_bwd_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/max_pool_bwd/max_pool_bwd_instance_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp"
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I32  = int32_t;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+template <typename DOutDataType, typename IndexDataType, typename DInDataType>
+using device_maxpool_bwd_instances =
+    // clang-format off
+    std::tuple <
+        DeviceMaxPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 1>,
+        DeviceMaxPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 2>,
+        DeviceMaxPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 4>
+               // clang-format on
+               >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt
@@ -3,6 +3,10 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
    list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
                                          device_max_pool3d_fwd_ndhwc_f16_instance.cpp)
 endif()
+if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp
+                                          device_max_pool3d_fwd_ndhwc_bf16_instance.cpp)
+endif()
 if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
    list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
                                          device_max_pool3d_fwd_ndhwc_f32_instance.cpp)