Unverified commit 4396a224, authored by Harisankar Sadasivan, committed by GitHub

Merge branch 'develop' into mi300_time_measurement

parents 0a27f07e 501a6b68
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <index_t kMPerTile, index_t kNPerTile, index_t kKPerTile>
struct TileGemmShape
{
static constexpr index_t kM = kMPerTile;
static constexpr index_t kN = kNPerTile;
static constexpr index_t kK = kKPerTile;
};
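// Illustrative sketch (not part of the original header): a block tile covering
// 128x128 of C and stepping 32 elements of K per iteration.
//   using BlockGemmShape = TileGemmShape<128, 128, 32>;
//   static_assert(BlockGemmShape::kM == 128 && BlockGemmShape::kK == 32);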
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp"
namespace ck_tile {
// fp16
using WarpGemmMfmaF16F16F32M32N32K8 =
WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K8>>;
using WarpGemmMfmaF16F16F32M16N16K16 =
WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K16>>;
using WarpGemmMfmaF16F16F32M32N32K16 =
WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplF16F16F32M32N32K8, 2>>;
using WarpGemmMfmaF16F16F32M16N16K32 =
WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplF16F16F32M16N16K16, 2>>;
using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution = WarpGemmImpl<
WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplF16F16F32M32N32K8>>;
using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution = WarpGemmImpl<
WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplF16F16F32M16N16K16>>;
using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution =
WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
WarpGemmAttributeMfmaImplF16F16F32M32N32K8,
2>>;
using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution =
WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
WarpGemmAttributeMfmaImplF16F16F32M16N16K16,
2>>;
using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution =
WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
WarpGemmAttributeMfmaImplF16F16F32M32N32K8,
2>>;
// bf16
using WarpGemmMfmaBf16Bf16F32M32N32K8 =
WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8>>;
using WarpGemmMfmaBf16Bf16F32M16N16K16 =
WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16>>;
using WarpGemmMfmaBf16Bf16F32M32N32K16 =
WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8, 2>>;
using WarpGemmMfmaBf16Bf16F32M16N16K32 =
WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16, 2>>;
using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution = WarpGemmImpl<
WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8>>;
using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution = WarpGemmImpl<
WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16>>;
using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution =
WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8,
2>>;
using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16,
2>>;
using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution =
WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8,
2>>;
// fp8
using WarpGemmMfma_f32_32x32x16_fp8_fp8 =
WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8>>;
using WarpGemmMfma_f32_32x32x16_fp8_bf8 =
WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8>>;
using WarpGemmMfma_f32_32x32x16_bf8_fp8 =
WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8>>;
using WarpGemmMfma_f32_32x32x16_bf8_bf8 =
WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8>>;
using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed = WarpGemmImpl<
WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8>>;
using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed = WarpGemmImpl<
WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8>>;
using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed = WarpGemmImpl<
WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8>>;
using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed = WarpGemmImpl<
WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8>>;
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp"
namespace ck_tile {
template <typename WarpGemmAttributeMfmaImpl_>
struct WarpGemmAtrributeMfma
{
using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
using ADataType = typename Impl::ADataType;
using BDataType = typename Impl::BDataType;
using CDataType = typename Impl::CDataType;
using AVecType = typename Impl::AVecType;
using BVecType = typename Impl::BVecType;
using CVecType = typename Impl::CVecType;
static constexpr index_t kM = Impl::kM;
static constexpr index_t kN = Impl::kN;
static constexpr index_t kK = Impl::kK;
using AWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
tuple<sequence<2, 1>>,
tuple<sequence<0, 0>>,
sequence<2>,
sequence<1>>;
using BWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
tuple<sequence<2, 1>>,
tuple<sequence<0, 0>>,
sequence<2>,
sequence<1>>;
using CWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>,
sequence<Impl::kCNLane>>,
tuple<sequence<1, 2>>,
tuple<sequence<1, 0>>,
sequence<1, 1>,
sequence<0, 2>>;
// c_vec += a_vec * b_vec
CK_TILE_DEVICE void
operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
{
Impl{}(c_vec, a_vec, b_vec);
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
return Impl{}(a_vec, b_vec);
}
};
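// Worked example (illustrative, using the impl constants defined later in this
// series of headers): for WarpGemmAttributeMfmaImplF16F16F32M32N32K8,
// kAMLane = 32, kABKLane = 2 and kABKPerLane = 4, so AWarpDstrEncoding splits
// the 64 lanes of a wave into 32 M-lanes x 2 K-lanes, with each lane owning 4
// consecutive K elements of A, i.e. the 4-wide AVecType fed to a single MFMA.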
template <typename WarpGemmAttributeMfmaImpl_, index_t kKIter>
struct WarpGemmAtrributeMfmaIterateK
{
static_assert(kKIter > 0, "wrong!");
using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
using ADataType = typename Impl::ADataType;
using BDataType = typename Impl::BDataType;
using CDataType = typename Impl::CDataType;
using AVecType =
ext_vector_t<ADataType, vector_traits<typename Impl::AVecType>::vector_size * kKIter>;
using BVecType =
ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
using CVecType = typename Impl::CVecType;
static constexpr index_t kM = Impl::kM;
static constexpr index_t kN = Impl::kN;
static constexpr index_t kK = Impl::kK * kKIter;
using AWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
tuple<sequence<2, 1>>,
tuple<sequence<0, 0>>,
sequence<2>,
sequence<1>>;
using BWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
tuple<sequence<2, 1>>,
tuple<sequence<0, 0>>,
sequence<2>,
sequence<1>>;
using CWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>,
sequence<Impl::kCNLane>>,
tuple<sequence<1, 2>>,
tuple<sequence<1, 0>>,
sequence<1, 1>,
sequence<0, 2>>;
// c_vec += a_vec * b_vec
CK_TILE_DEVICE void
operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
{
using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
static_for<0, kKIter, 1>{}([&](auto iKIter) {
Impl{}(c_vec,
reinterpret_cast<const buf_a&>(a_vec)
.template get_as<typename Impl::AVecType>()[iKIter],
reinterpret_cast<const buf_b&>(b_vec)
.template get_as<typename Impl::BVecType>()[iKIter]);
});
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
constexpr auto I0 = number<0>{};
using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
// c = a * b
auto c_vec = Impl{}(
reinterpret_cast<const buf_a&>(a_vec).template get_as<typename Impl::AVecType>()[I0],
reinterpret_cast<const buf_b&>(b_vec).template get_as<typename Impl::BVecType>()[I0]);
// c += a * b
static_for<1, kKIter, 1>{}([&](auto iKIter) {
Impl{}(c_vec,
reinterpret_cast<const buf_a&>(a_vec)
.template get_as<typename Impl::AVecType>()[iKIter],
reinterpret_cast<const buf_b&>(b_vec)
.template get_as<typename Impl::BVecType>()[iKIter]);
});
return c_vec;
}
};
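// Worked example (illustrative): with Impl = WarpGemmAttributeMfmaImplF16F16F32M32N32K8
// and kKIter = 2, AVecType widens from 4 to 8 fp16 values and kK becomes 16.
// operator() splits each wide vector back into kKIter impl-sized pieces and
// chains two 32x32x8 MFMAs, which is how WarpGemmMfmaF16F16F32M32N32K16 above
// is composed.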
template <typename WarpGemmAttributeMfmaImpl_>
struct WarpGemmAtrributeMfmaTransposedCDistribution
{
using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
using ADataType = typename Impl::BDataType;
using BDataType = typename Impl::ADataType;
using CDataType = typename Impl::CDataType;
using AVecType = typename Impl::BVecType;
using BVecType = typename Impl::AVecType;
using CVecType = typename Impl::CVecType;
static constexpr index_t kM = Impl::kN;
static constexpr index_t kN = Impl::kM;
static constexpr index_t kK = Impl::kK;
using AWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
tuple<sequence<2, 1>>,
tuple<sequence<0, 0>>,
sequence<2>,
sequence<1>>;
using BWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
tuple<sequence<2, 1>>,
tuple<sequence<0, 0>>,
sequence<2>,
sequence<1>>;
using CWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kCNLane>,
sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>>,
tuple<sequence<2, 1>>,
tuple<sequence<1, 0>>,
sequence<2, 2>,
sequence<0, 2>>;
// c_vec += a_vec * b_vec
CK_TILE_DEVICE void
operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
{
// swap A and B
Impl{}(c_vec, b_vec, a_vec);
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
// swap A and B
return Impl{}(b_vec, a_vec);
}
};
template <typename WarpGemmAttributeMfmaImpl_>
struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
{
using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
using ADataType = typename Impl::BDataType;
using BDataType = typename Impl::ADataType;
using CDataType = typename Impl::CDataType;
using AVecType = typename Impl::BVecType;
using BVecType = typename Impl::AVecType;
using CVecType = typename Impl::CVecType;
static constexpr index_t kM = Impl::kN;
static constexpr index_t kN = Impl::kM;
static constexpr index_t kK = Impl::kK;
using AWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
tuple<sequence<2, 1>>,
tuple<sequence<0, 0>>,
sequence<2>,
sequence<1>>;
using BWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kAMLane / (Impl::kABKPerLane * Impl::kABKLane * 2),
Impl::kABKLane,
2,
Impl::kABKPerLane>,
sequence<Impl::kABKLane, Impl::kABKPerLane>>,
tuple<sequence<2, 1, 1, 1, 1>>,
tuple<sequence<0, 0, 2, 1, 3>>,
sequence<2>,
sequence<1>>;
using CWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kCNLane>,
sequence<Impl::kCM0PerLane / 2, Impl::kCMLane, Impl::kCM1PerLane * 2>>,
tuple<sequence<2, 1>>,
tuple<sequence<1, 0>>,
sequence<2, 2>,
sequence<0, 2>>;
// c_vec += a_vec * b_vec
CK_TILE_DEVICE void
operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
{
// swap A and B
Impl{}(c_vec, b_vec, a_vec);
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
// swap A and B
return Impl{}(b_vec, a_vec);
}
};
template <typename WarpGemmAttributeMfmaImpl_, index_t kKIter>
struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
{
using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
// swap A and B
using ADataType = typename Impl::BDataType;
using BDataType = typename Impl::ADataType;
using CDataType = typename Impl::CDataType;
using AVecType =
ext_vector_t<ADataType, vector_traits<typename Impl::AVecType>::vector_size * kKIter>;
using BVecType =
ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
using CVecType = typename Impl::CVecType;
static constexpr index_t kM = Impl::kN;
static constexpr index_t kN = Impl::kM;
static constexpr index_t kK = Impl::kK * kKIter;
using AWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
tuple<sequence<2, 1>>,
tuple<sequence<0, 0>>,
sequence<2>,
sequence<1>>;
using BWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
tuple<sequence<2, 1>>,
tuple<sequence<0, 0>>,
sequence<2>,
sequence<1>>;
using CWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kCNLane>,
sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>>,
tuple<sequence<2, 1>>,
tuple<sequence<1, 0>>,
sequence<2, 2>,
sequence<0, 2>>;
// c_vec += a_vec * b_vec
CK_TILE_DEVICE void
operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
{
using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
// swap A and B, value and type
static_for<0, kKIter, 1>{}([&](auto iKIter) {
Impl{}(c_vec,
reinterpret_cast<const buf_b&>(b_vec)
.template get_as<typename Impl::BVecType>()[iKIter],
reinterpret_cast<const buf_a&>(a_vec)
.template get_as<typename Impl::AVecType>()[iKIter]);
});
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
constexpr auto I0 = number<0>{};
using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
// swap A and B, value and type
auto c_vec = Impl{}(
reinterpret_cast<const buf_b&>(b_vec).template get_as<typename Impl::BVecType>()[I0],
reinterpret_cast<const buf_a&>(a_vec).template get_as<typename Impl::AVecType>()[I0]);
static_for<1, kKIter, 1>{}([&](auto iKIter) {
Impl{}(c_vec,
reinterpret_cast<const buf_b&>(b_vec)
.template get_as<typename Impl::BVecType>()[iKIter],
reinterpret_cast<const buf_a&>(a_vec)
.template get_as<typename Impl::AVecType>()[iKIter]);
});
return c_vec;
}
};
template <typename WarpGemmAttributeMfmaImpl_, index_t kKIter, index_t SFactor_ = 2>
struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
{
using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
// swap A and B
using ADataType = typename Impl::BDataType;
using BDataType = typename Impl::ADataType;
using CDataType = typename Impl::CDataType;
using AVecType =
ext_vector_t<ADataType, vector_traits<typename Impl::AVecType>::vector_size * kKIter>;
using BVecType =
ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
using CVecType = typename Impl::CVecType;
static constexpr index_t kM = Impl::kN;
static constexpr index_t kN = Impl::kM;
static constexpr index_t kK = Impl::kK * kKIter;
static constexpr index_t SFactor = SFactor_; // how many kCM1PerLane groups to fuse together
using AWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
tuple<sequence<2, 1>>,
tuple<sequence<0, 0>>,
sequence<2>,
sequence<1>>;
#if 0
using BWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kAMLane / (Impl::kABKPerLane * Impl::kABKLane * 2),
Impl::kABKLane,
2,
Impl::kABKPerLane>,
sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
tuple<sequence<2, 1, 1, 1, 1>>,
tuple<sequence<0, 0, 2, 1, 3>>,
sequence<2>,
sequence<1>>;
using CWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kCNLane>,
sequence<Impl::kCM0PerLane / 2, Impl::kCMLane, Impl::kCM1PerLane * 2>>,
tuple<sequence<2, 1>>,
tuple<sequence<1, 0>>,
sequence<2, 2>,
sequence<0, 2>>;
#else
// TODO: add tests for shapes other than 32x32
using BWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kAMLane / (Impl::kCMLane * SFactor * Impl::kCM1PerLane),
Impl::kCMLane,
SFactor,
Impl::kCM1PerLane>,
sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
tuple<sequence<2, 1, 1, 1, 1>>,
tuple<sequence<0, 0, 2, 1, 3>>,
sequence<2>,
sequence<1>>;
using CWarpDstrEncoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Impl::kCNLane>,
sequence<Impl::kCM0PerLane / SFactor, Impl::kCMLane, Impl::kCM1PerLane * SFactor>>,
tuple<sequence<2, 1>>,
tuple<sequence<1, 0>>,
sequence<2, 2>,
sequence<0, 2>>;
#endif
// c_vec += a_vec * b_vec
CK_TILE_DEVICE void
operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
{
using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
// swap A and B, value and type
static_for<0, kKIter, 1>{}([&](auto iKIter) {
Impl{}(c_vec,
reinterpret_cast<const buf_b&>(b_vec)
.template get_as<typename Impl::BVecType>()[iKIter],
reinterpret_cast<const buf_a&>(a_vec)
.template get_as<typename Impl::AVecType>()[iKIter]);
});
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
constexpr auto I0 = number<0>{};
// swap A and B, value and type
auto c_vec = Impl{}(
reinterpret_cast<const buf_b&>(b_vec).template get_as<typename Impl::BVecType>()[I0],
reinterpret_cast<const buf_a&>(a_vec).template get_as<typename Impl::AVecType>()[I0]);
static_for<1, kKIter, 1>{}([&](auto iKIter) {
Impl{}(c_vec,
reinterpret_cast<const buf_b&>(b_vec)
.template get_as<typename Impl::BVecType>()[iKIter],
reinterpret_cast<const buf_a&>(a_vec)
.template get_as<typename Impl::AVecType>()[iKIter]);
});
return c_vec;
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
// FP16
struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
{
using ADataType = fp16_t;
using BDataType = fp16_t;
using CDataType = float;
using AVecType = ext_vector_t<fp16_t, 4>;
using BVecType = ext_vector_t<fp16_t, 4>;
using CVecType = ext_vector_t<float, 16>;
static constexpr index_t kM = 32;
static constexpr index_t kN = 32;
static constexpr index_t kK = 8;
static constexpr index_t kAMLane = 32;
static constexpr index_t kBNLane = 32;
static constexpr index_t kABKLane = 2;
static constexpr index_t kABKPerLane = 4;
static constexpr index_t kCMLane = 2;
static constexpr index_t kCNLane = 32;
static constexpr index_t kCM0PerLane = 4;
static constexpr index_t kCM1PerLane = 4;
// c_vec += a_vec * b_vec
CK_TILE_DEVICE void
operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
{
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__)
c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0);
#else
ck_tile::ignore = c_vec;
ck_tile::ignore = a_vec;
ck_tile::ignore = b_vec;
#endif
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__)
return bit_cast<CVecType>(
__builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0));
#else
ck_tile::ignore = a_vec;
ck_tile::ignore = b_vec;
return CVecType{0.f};
#endif
}
};
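// Illustrative sketch (not part of the original header): inside a kernel
// compiled for one of the gfx9 targets guarded above, a lane accumulates with
//   using Impl = WarpGemmAttributeMfmaImplF16F16F32M32N32K8;
//   Impl::CVecType c_vec{0.f}; // 16 floats of accumulator per lane
//   Impl{}(c_vec, a_vec, b_vec); // one v_mfma_f32_32x32x8f16 per call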
struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
{
using ADataType = fp16_t;
using BDataType = fp16_t;
using CDataType = float;
using AVecType = ext_vector_t<fp16_t, 4>;
using BVecType = ext_vector_t<fp16_t, 4>;
using CVecType = ext_vector_t<float, 4>;
static constexpr index_t kM = 16;
static constexpr index_t kN = 16;
static constexpr index_t kK = 16;
static constexpr index_t kAMLane = 16;
static constexpr index_t kBNLane = 16;
static constexpr index_t kABKLane = 4;
static constexpr index_t kABKPerLane = 4;
static constexpr index_t kCMLane = 4;
static constexpr index_t kCNLane = 16;
static constexpr index_t kCM0PerLane = 1;
static constexpr index_t kCM1PerLane = 4;
// c_vec += a_vec * b_vec
CK_TILE_DEVICE void
operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
{
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__)
c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0);
#else
ck_tile::ignore = c_vec;
ck_tile::ignore = a_vec;
ck_tile::ignore = b_vec;
#endif
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__)
return bit_cast<CVecType>(
__builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
#else
ck_tile::ignore = a_vec;
ck_tile::ignore = b_vec;
return CVecType{0.f};
#endif
}
};
// Bf16
struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
{
using ADataType = bf16_t;
using BDataType = bf16_t;
using CDataType = float;
using AVecType = ext_vector_t<bf16_t, 4>;
using BVecType = ext_vector_t<bf16_t, 4>;
using CVecType = ext_vector_t<float, 16>;
static constexpr index_t kM = 32;
static constexpr index_t kN = 32;
static constexpr index_t kK = 8;
static constexpr index_t kAMLane = 32;
static constexpr index_t kBNLane = 32;
static constexpr index_t kABKLane = 2;
static constexpr index_t kABKPerLane = 4;
static constexpr index_t kCMLane = 2;
static constexpr index_t kCNLane = 32;
static constexpr index_t kCM0PerLane = 4;
static constexpr index_t kCM1PerLane = 4;
// c_vec += a_vec * b_vec
CK_TILE_DEVICE void
operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
{
#if defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
#elif defined(__gfx908__)
static_for<0, 2, 1>{}([&](auto k) {
c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16(
reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
.template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
.template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
c_vec,
0,
0,
0);
});
#else
ck_tile::ignore = c_vec;
ck_tile::ignore = a_vec;
ck_tile::ignore = b_vec;
#endif
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
#if defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
return bit_cast<CVecType>(
__builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0));
#elif defined(__gfx908__)
CVecType c_vec{0.f};
static_for<0, 2, 1>{}([&](auto k) {
c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16(
reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
.template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
.template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
c_vec,
0,
0,
0);
});
return c_vec;
#else
ck_tile::ignore = a_vec;
ck_tile::ignore = b_vec;
return CVecType{0.f};
#endif
}
};
struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
{
using ADataType = bf16_t;
using BDataType = bf16_t;
using CDataType = float;
using AVecType = ext_vector_t<bf16_t, 4>;
using BVecType = ext_vector_t<bf16_t, 4>;
using CVecType = ext_vector_t<float, 4>;
static constexpr index_t kM = 16;
static constexpr index_t kN = 16;
static constexpr index_t kK = 16;
static constexpr index_t kAMLane = 16;
static constexpr index_t kBNLane = 16;
static constexpr index_t kABKLane = 4;
static constexpr index_t kABKPerLane = 4;
static constexpr index_t kCMLane = 4;
static constexpr index_t kCNLane = 16;
static constexpr index_t kCM0PerLane = 1;
static constexpr index_t kCM1PerLane = 4;
// c_vec += a_vec * b_vec
CK_TILE_DEVICE void
operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
{
#if defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
c_vec = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
#elif defined(__gfx908__)
static_for<0, 2, 1>{}([&](auto k) {
c_vec = __builtin_amdgcn_mfma_f32_16x16x8bf16(
reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
.template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
.template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
c_vec,
0,
0,
0);
});
#else
ck_tile::ignore = c_vec;
ck_tile::ignore = a_vec;
ck_tile::ignore = b_vec;
#endif
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
#if defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
return bit_cast<CVecType>(
__builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
#elif defined(__gfx908__)
CVecType c_vec{0.f};
static_for<0, 2, 1>{}([&](auto k) {
c_vec = __builtin_amdgcn_mfma_f32_16x16x8bf16(
reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
.template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
.template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
c_vec,
0,
0,
0);
});
return c_vec;
#else
ck_tile::ignore = a_vec;
ck_tile::ignore = b_vec;
return CVecType{0.f};
#endif
}
};
// FP8
template <typename AType_, typename BType_>
struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
{
using ADataType = AType_;
using BDataType = BType_;
using CDataType = float;
using AVecType = ext_vector_t<ADataType, 8>;
using BVecType = ext_vector_t<BDataType, 8>;
using CVecType = ext_vector_t<CDataType, 16>;
static constexpr index_t kM = 32;
static constexpr index_t kN = 32;
static constexpr index_t kK = 16;
static constexpr index_t kAMLane = 32;
static constexpr index_t kBNLane = 32;
static constexpr index_t kABKLane = 2;
static constexpr index_t kABKPerLane = 8;
static constexpr index_t kCMLane = 2;
static constexpr index_t kCNLane = 32;
static constexpr index_t kCM0PerLane = 4;
static constexpr index_t kCM1PerLane = 4;
// c_vec += a_vec * b_vec
CK_TILE_DEVICE void
operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
{
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
#elif defined(__gfx908__) || defined(__gfx90a__)
static_for<0, 8, 1>{}([&](auto k) {
float a_f32 =
type_convert<float>(reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
.template get_as<ADataType>()[number<k>{}]);
float b_f32 =
type_convert<float>(reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
.template get_as<BDataType>()[number<k>{}]);
c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
});
#else
ck_tile::ignore = c_vec;
ck_tile::ignore = a_vec;
ck_tile::ignore = b_vec;
#endif
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
#elif defined(__gfx908__) || defined(__gfx90a__)
CVecType c_vec{0.f};
static_for<0, 8, 1>{}([&](auto k) {
float a_f32 =
type_convert<float>(reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
.template get_as<ADataType>()[number<k>{}]);
float b_f32 =
type_convert<float>(reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
.template get_as<BDataType>()[number<k>{}]);
c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
});
return c_vec;
#else
ck_tile::ignore = a_vec;
ck_tile::ignore = b_vec;
return CVecType{0.f};
#endif
}
};
using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8 =
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t>;
using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8 =
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, bf8_t>;
using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8 =
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, fp8_t>;
using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8 =
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, bf8_t>;
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
namespace ck_tile {
namespace impl {
template <typename AType,
typename BType,
typename CType,
index_t MPerWave,
index_t NPerWave,
index_t KPerWave,
bool TransposeC>
struct WarpGemmMfmaDispatcher;
// clang-format off
// fp16
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 8, false> { using Type = WarpGemmMfmaF16F16F32M32N32K8; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 8, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaF16F16F32M32N32K16; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaF16F16F32M16N16K16; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaF16F16F32M16N16K32; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; };
// bf16
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 8, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; };
// fp8
template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32, 16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32, 16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32, 16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32, 16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32, 16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32, 16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32, 16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32, 16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; };
// clang-format on
} // namespace impl
template <typename AType,
typename BType,
typename CType,
index_t MPerWave,
index_t NPerWave,
index_t KPerWave,
bool TransposeC>
using WarpGemmMfmaDispatcher = typename impl::
WarpGemmMfmaDispatcher<AType, BType, CType, MPerWave, NPerWave, KPerWave, TransposeC>::Type;
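// Illustrative usage (a sketch): per the specialization table above, picking
// the 32x32x16 fp16 warp GEMM with the non-transposed C distribution resolves
// to WarpGemmMfmaF16F16F32M32N32K16:
//   using WG = ck_tile::WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t,
//                                              float, 32, 32, 16, false>;
//   static_assert(WG::kM == 32 && WG::kN == 32 && WG::kK == 16);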
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <typename WarpGemmAttribute_>
struct WarpGemmImpl
{
using WarpGemmAttribute = remove_cvref_t<WarpGemmAttribute_>;
static constexpr index_t kM = WarpGemmAttribute::kM;
static constexpr index_t kN = WarpGemmAttribute::kN;
static constexpr index_t kK = WarpGemmAttribute::kK;
using ADataType = typename WarpGemmAttribute::ADataType;
using BDataType = typename WarpGemmAttribute::BDataType;
using CDataType = typename WarpGemmAttribute::CDataType;
using AWarpDstrEncoding = typename WarpGemmAttribute::AWarpDstrEncoding;
using BWarpDstrEncoding = typename WarpGemmAttribute::BWarpDstrEncoding;
using CWarpDstrEncoding = typename WarpGemmAttribute::CWarpDstrEncoding;
using AWarpDstr = remove_cvref_t<decltype(make_static_tile_distribution(AWarpDstrEncoding{}))>;
using BWarpDstr = remove_cvref_t<decltype(make_static_tile_distribution(BWarpDstrEncoding{}))>;
using CWarpDstr = remove_cvref_t<decltype(make_static_tile_distribution(CWarpDstrEncoding{}))>;
using AWarpTensor = static_distributed_tensor<ADataType, AWarpDstr>;
using BWarpTensor = static_distributed_tensor<BDataType, BWarpDstr>;
using CWarpTensor = static_distributed_tensor<CDataType, CWarpDstr>;
CK_TILE_DEVICE void operator()(CWarpTensor& c, const AWarpTensor& a, const BWarpTensor& b) const
{
using AVec = ext_vector_t<ADataType, AWarpTensor::get_thread_buffer_size()>;
using BVec = ext_vector_t<BDataType, BWarpTensor::get_thread_buffer_size()>;
using CVec = ext_vector_t<CDataType, CWarpTensor::get_thread_buffer_size()>;
constexpr auto I0 = number<0>{};
const auto a_vec = a.get_thread_buffer().template get_as<AVec>()[I0];
const auto b_vec = b.get_thread_buffer().template get_as<BVec>()[I0];
auto c_vec = c.get_thread_buffer().template get_as<CVec>()[I0];
// c_vec += a_vec * b_vec
WarpGemmAttribute{}(c_vec, a_vec, b_vec);
c.get_thread_buffer().template set_as<CVec>(I0, c_vec);
}
CK_TILE_DEVICE auto operator()(const AWarpTensor& a, const BWarpTensor& b) const
{
CWarpTensor c;
using AVec = ext_vector_t<ADataType, AWarpTensor::get_thread_buffer_size()>;
using BVec = ext_vector_t<BDataType, BWarpTensor::get_thread_buffer_size()>;
using CVec = ext_vector_t<CDataType, CWarpTensor::get_thread_buffer_size()>;
constexpr auto I0 = number<0>{};
const auto a_vec = a.get_thread_buffer().template get_as<AVec>()[I0];
const auto b_vec = b.get_thread_buffer().template get_as<BVec>()[I0];
// c_vec = a_vec * b_vec
auto c_vec = WarpGemmAttribute{}(a_vec, b_vec);
c.get_thread_buffer().template set_as<CVec>(I0, c_vec);
return c;
}
};
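// Illustrative sketch (not part of the original header): with a concrete
// attribute, e.g. WG = WarpGemmMfmaF16F16F32M32N32K8, a warp-level GEMM is
//   WG::AWarpTensor a; WG::BWarpTensor b; // distributed per AWarpDstr/BWarpDstr
//   WG::CWarpTensor c;                    // distributed per CWarpDstr
//   // ... load a and b, clear c ...
//   WG{}(c, a, b); // c += a * b through the underlying MFMA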
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/reduce/block/block_reduce.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
// synchronize the reduce result (cross-lane reduction, then broadcast over the replicated dimensions)
template <typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true>
CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
const ReduceFunc& reduce_func,
bool_constant<WithBroadcast> = {})
{
using Dstr = typename AccDistributedTensor_::StaticTileDistribution;
using DstrEncode = typename Dstr::DstrEncode;
using DstrEncodeDetail = typename DstrEncode::detail;
constexpr index_t NDimP = Dstr::get_num_of_dimension_p();
constexpr index_t NDimR = Dstr::get_num_of_dimension_r();
constexpr index_t idim_p_lane = NDimP - 1;
const auto ps_idx = make_array<index_t>(get_warp_id(), get_lane_id());
const auto rs_idx = acc_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);
constexpr index_t thread_buf_size = AccDistributedTensor_::get_thread_buffer_size();
// loop over thread data
static_for<0, thread_buf_size, 1>{}([&](auto i) {
auto v_local = acc_tensor.get_thread_buffer()[i];
// cross-lane reduce for replication:
// only reduce over the R dimensions that the lane id maps to
static_for<0, NDimR, 1>{}([&](auto idim_r) {
// FIXME: nasty to use does_p_own_r_
if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
{
constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
constexpr index_t lid_over_rid_derivative =
DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r];
static_assert(is_power_of_two_integer(r_length),
"wrong! only support power of 2 reduction");
constexpr index_t nstage = integer_log2_floor(r_length);
// reduction sweep forward
static_for<0, nstage, 1>{}([&](auto istage) {
constexpr index_t lid_delta =
lid_over_rid_derivative * (1 << (nstage - istage - 1));
// pull data from remote lane
const auto v_remote = warp_shuffle_down(v_local, lid_delta);
// reduce
v_local = reduce_func(v_local, v_remote);
});
}
});
if constexpr(WithBroadcast)
{
// cross-lane broadcast for replication:
// only broadcast over the R dimensions that the lane id maps to
static_for<0, NDimR, 1>{}([&](auto idim_r) {
// FIXME: nasty to use does_p_own_r_
if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
{
const index_t r_id = rs_idx[idim_r];
constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
constexpr index_t lid_over_rid_derivative =
DstrEncodeDetail::ps_over_rs_derivative_[NDimP - 1][idim_r];
static_assert(is_power_of_two_integer(r_length),
"wrong! only support power of 2 reduction");
constexpr index_t nstage = integer_log2_floor(r_length);
// broadcast sweep backward
static_for<0, nstage, 1>{}([&](auto istage) {
// do I hold reduced data?
const bool do_i_hold_reduced_data = r_id < (1 << istage);
constexpr index_t lid_delta = lid_over_rid_derivative * (1 << istage);
// pull data from remote lane
const auto v_remote = warp_shuffle_up(v_local, lid_delta);
// decide whether to update local data with remote data
v_local = do_i_hold_reduced_data ? v_local : v_remote;
});
}
});
}
acc_tensor.get_thread_buffer()(i) = v_local;
});
}
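// Worked example (illustrative): for a lane-owned R dimension with
// r_length = 4 and lid_over_rid_derivative = 1, the reduce sweep pulls from
// lanes +2 then +1 via warp_shuffle_down, leaving the fully reduced value at
// r_id = 0; the broadcast sweep then pushes it back out via warp_shuffle_up
// by +1 then +2 so every replica ends up holding the same result.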
// FIXME: this is for 2D to 1D reduce only, need to support n-D
template <typename AccDistributedTensor_,
typename InDistributedTensor_,
index_t... InReduceDims,
typename ReduceFunc>
CK_TILE_DEVICE void block_tile_reduce(AccDistributedTensor_& acc_tensor,
const InDistributedTensor_& in_tensor,
sequence<InReduceDims...>,
const ReduceFunc& reduce_func)
{
constexpr auto I0 = number<0>{};
constexpr auto I1 = number<1>{};
#if 0
constexpr auto in_reduce_dims = sequence<InReduceDims...>{};
constexpr index_t ndim_in = InDistributedTensor_::get_num_of_dimension();
constexpr index_t ndim_in_reduce = in_reduce_dims.size();
constexpr index_t ndim_in_free = ndim_in - ndim_in_reduce;
constexpr auto in_free_dims_arr = [&] {
array<bool, ndim_in> is_free_dims{true};
for(index_t i = 0; i < ndim_in_reduce; i++)
{
is_free_dims(in_reduce_dims[i]) = false;
}
array<index_t, ndim_in_free> in_free_dims{-1};
index_t cnt = 0;
for(index_t i = 0; i < ndim_in; i++)
{
if(is_free_dims[i])
{
in_free_dims(cnt) = i;
cnt++;
}
}
return in_free_dims;
}();
constexpr auto in_free_dims = TO_SEQUENCE(in_free_dims_arr, ndim_in_free);
#else
constexpr auto spans = InDistributedTensor_::get_distributed_spans();
// in-thread reduction
// FIXME: hard coded to be 2D to 1D reduction
sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) {
constexpr auto acc_dstr_idx = make_tuple(dstr_idx_i0);
auto acc = acc_tensor[acc_dstr_idx];
// FIXME
sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) {
constexpr auto in_dstr_idx = make_tuple(dstr_idx_i0, dstr_idx_i1);
const auto in = in_tensor[in_dstr_idx];
acc = reduce_func(acc, in);
});
acc_tensor(acc_dstr_idx) = acc;
});
#endif
}
template <typename AccDataType_,
typename InDistributedTensor_,
index_t... InReduceDims,
typename ReduceFunc,
typename InDataType_>
CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor,
sequence<InReduceDims...> in_reduce_dims,
const ReduceFunc& reduce_func,
const InDataType_& reduce_init)
{
using InDataType = typename InDistributedTensor_::DataType;
using AccDataType = remove_cvref_t<AccDataType_>;
static_assert(std::is_same_v<InDataType, remove_cvref_t<InDataType_>>, "wrong!");
// declare acc_tensor
constexpr auto acc_dstr =
make_static_tile_distribution(ck_tile::detail::make_reduce_tile_distribution_encoding(
InDistributedTensor_::get_tile_distribution().get_static_tile_distribution_encoding(),
sequence<InReduceDims...>{}));
auto acc_tensor = make_static_distributed_tensor<AccDataType>(acc_dstr);
// init acc_tensor
tile_elementwise_inout([&](auto& acc) { acc = type_convert<AccDataType>(reduce_init); },
acc_tensor);
// warp reduce
block_tile_reduce(acc_tensor, in_tensor, in_reduce_dims, reduce_func);
return acc_tensor;
}
} // namespace ck_tile
import pathlib
from pathlib import Path
import subprocess
import os
import copy
NS = 'ck_tile'
OPS = 'ops'
OPS_COMMON = 'common' # headers under ops/common are duplicated into every other ops/* module
HEADER_COMMON = """// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
"""
# aa/bb/cc/file.hpp -> (aa, bb, cc, file.hpp)
def get_module(f, level = 0):
all_parts = f.parts
return str(all_parts[level])
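# e.g. (illustrative) get_module(PurePath('ops/gemm/warp/x.hpp'))    -> 'ops'
#                     get_module(PurePath('ops/gemm/warp/x.hpp'), 1) -> 'gemm'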
all_files = []
for p in sorted(Path("./").rglob("*")):
if p.suffix == '.hpp':
all_files.append(pathlib.PurePath(p))
class submodule_t:
def __init__(self):
self.m = dict()
def push(self, f):
if len(f.parents) != 1: # ignore ./xxx.hpp
mod = get_module(f)
if mod == OPS:
if mod not in self.m.keys():
self.m[mod] = dict()
mod2 = get_module(f, 1)
if Path(mod2).suffix != '.hpp':
# ignore ops/xxx.hpp
if mod2 not in self.m[mod].keys():
self.m[mod][mod2] = list()
self.m[mod][mod2].append(f)
else:
if mod not in self.m.keys():
self.m[mod] = list()
self.m[mod].append(f)
def gen(self):
def gen_header(hpath, include_list):
# print(hpath)
if os.path.exists(str(hpath)):
os.remove(str(hpath))
with hpath.open('w') as f:
f.write(HEADER_COMMON)
f.write('#pragma once\n')
f.write('\n')
for individual_header in include_list:
header_path = NS + '/' + str(individual_header)
f.write(f'#include \"{header_path}\"\n')
# f.write('\n') # otherwise clang-format will complain
# print(self.m)
# restructure common
for k, v in self.m.items():
if k == OPS and OPS_COMMON in v.keys():
common_list = copy.deepcopy(v[OPS_COMMON])
# v.pop(OPS_COMMON)
for km in v.keys():
if km != OPS_COMMON:
v[km].extend(common_list)
for k, v in self.m.items():
if k == OPS:
for km, kv in v.items():
gen_header(Path(k) / (f'{km}.hpp'), kv)
else:
gen_header(Path(f'{k}.hpp'), v)
submodule = submodule_t()
# formatting
for x in all_files:
# run dos2unix to completion before clang-format touches the same file
subprocess.run(f'dos2unix {str(x)}', shell=True)
subprocess.run(f'clang-format-12 -style=file -i {str(x)}', shell=True)
submodule.push(x)
submodule.gen()
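# Illustrative output (a sketch): for headers under ops/gemm/, gen() rewrites
# the umbrella header ops/gemm.hpp roughly as
#   // SPDX-License-Identifier: MIT
#   // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#
#   #pragma once
#
#   #include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
#   ...
# with every ops/common header appended to each ops/* include list.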
#print(all_files)
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using Scales = ck::tensor_operation::element_wise::Scales;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using Add = ck::tensor_operation::element_wise::Add;
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
#ifdef CK_ENABLE_INT8
// RRR
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<Row>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Scales,
AddFastGelu>>>& instances);
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<Row>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Scales,
Add>>>& instances);
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Scales,
FastGelu>>>& instances);
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Scales,
PassThrough>>>& instances);
// RCR
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<Row>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Scales,
AddFastGelu>>>& instances);
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<Row>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Scales,
Add>>>& instances);
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Scales,
FastGelu>>>& instances);
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Scales,
PassThrough>>>& instances);
// CRR
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Col>,
ck::Tuple<Row, Row>,
ck::Tuple<Row>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Scales,
AddFastGelu>>>& instances);
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Col>,
ck::Tuple<Row, Row>,
ck::Tuple<Row>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Scales,
Add>>>& instances);
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_gelu_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Col>,
ck::Tuple<Row, Row>,
ck::Tuple<>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Scales,
FastGelu>>>& instances);
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Col>,
ck::Tuple<Row, Row>,
ck::Tuple<>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Scales,
PassThrough>>>& instances);
#endif
// GEMM + Add + Gelu
template <typename AsLayout,
typename BsLayout,
typename DsLayout,
typename ELayout,
typename AsDataType,
typename BsDataType,
typename DsDataType,
typename EDataType>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceGemmMultipleABD<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
AddFastGelu>>
{
using DeviceOp = DeviceGemmMultipleABD<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
AddFastGelu>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
#ifdef CK_ENABLE_INT8
if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
{
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
{
add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
{
add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_v1_instances(op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
{
add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_v1_instances(op_ptrs);
}
}
#endif
return op_ptrs;
}
};
// GEMM + Add
template <typename AsLayout,
typename BsLayout,
typename DsLayout,
typename ELayout,
typename AsDataType,
typename BsDataType,
typename DsDataType,
typename EDataType>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceGemmMultipleABD<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
Add>>
{
using DeviceOp = DeviceGemmMultipleABD<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
Add>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
#ifdef CK_ENABLE_INT8
if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
{
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
{
add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
{
add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_v1_instances(op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
{
add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_v1_instances(op_ptrs);
}
}
#endif
return op_ptrs;
}
};
// GEMM + Gelu
template <typename AsLayout,
typename BsLayout,
typename DsLayout,
typename ELayout,
typename AsDataType,
typename BsDataType,
typename DsDataType,
typename EDataType>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceGemmMultipleABD<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
FastGelu>>
{
using DeviceOp = DeviceGemmMultipleABD<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
FastGelu>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
#ifdef CK_ENABLE_INT8
if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
is_same_v<DsDataType, ck::Tuple<>> && is_same_v<EDataType, BF16>)
{
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
{
add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_v1_instances(op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
{
add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_gelu_v1_instances(op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
{
add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_v1_instances(op_ptrs);
}
}
#endif
return op_ptrs;
}
};
// GEMM
template <typename AsLayout,
typename BsLayout,
typename DsLayout,
typename ELayout,
typename AsDataType,
typename BsDataType,
typename DsDataType,
typename EDataType>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceGemmMultipleABD<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
PassThrough>>
{
using DeviceOp = DeviceGemmMultipleABD<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
PassThrough>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
#ifdef CK_ENABLE_INT8
if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
is_same_v<DsDataType, ck::Tuple<>> && is_same_v<EDataType, BF16>)
{
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
{
add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instances(op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
{
add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_v1_instances(op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
{
add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_v1_instances(op_ptrs);
}
}
#endif
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
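// ---------------------------------------------------------------------------
// Illustrative client sketch, not part of the instance library: it assumes the
// factory specializations above are visible and that p_a0/p_b0/p_b1/p_e point
// to already-allocated device buffers. It enumerates the registered
// bf16 x (i8, bf16) -> bf16 GEMM+Gelu instances and runs the first one that
// accepts the problem shape. The MakeArgumentPointer parameter order follows
// the DeviceGemmMultipleABD interface and packed row-major strides are
// assumed; verify both against the actual header before relying on this.
// ---------------------------------------------------------------------------
#include <array>
#include <memory>

inline void run_first_supported_gemm_gelu(
    const void* p_a0, const void* p_b0, const void* p_b1, void* p_e,
    ck::index_t M, ck::index_t N, ck::index_t K)
{
    using Row         = ck::tensor_layout::gemm::RowMajor;
    using BF16        = ck::bhalf_t;
    using I8          = int8_t;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using Scales      = ck::tensor_operation::element_wise::Scales;
    using FastGelu    = ck::tensor_operation::element_wise::FastGelu;

    // Row-major A[M,K], two row-major B tensors (i8 weight + bf16 scale), no D.
    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleABD<
        ck::Tuple<Row>, ck::Tuple<Row, Row>, ck::Tuple<>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
        PassThrough, Scales, FastGelu>;

    auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    for(auto& op : op_ptrs)
    {
        // Element ops are assumed stateless and default-constructible here.
        auto arg = op->MakeArgumentPointer(
            std::array<const void*, 1>{p_a0},
            std::array<const void*, 2>{p_b0, p_b1},
            std::array<const void*, 0>{},
            p_e,
            M, N, K,
            std::array<ck::index_t, 1>{K},    // StrideAs: packed row-major A
            std::array<ck::index_t, 2>{N, N}, // StrideBs: packed row-major B0/B1
            std::array<ck::index_t, 0>{},
            N,                                // StrideE
            PassThrough{}, Scales{}, FastGelu{});

        if(op->IsSupportedArgument(arg.get()))
        {
            op->MakeInvokerPointer()->Run(arg.get(), StreamConfig{});
            return; // first supported instance wins; a tuner would time them all
        }
    }
}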
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd_fixed_nk.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using Scales = ck::tensor_operation::element_wise::Scales;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using Add = ck::tensor_operation::element_wise::Add;
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
// RRR
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<Row>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Scales,
AddFastGelu>>>& instances);
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<Row>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Scales,
Add>>>& instances);
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Scales,
FastGelu>>>& instances);
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Scales,
PassThrough>>>& instances);
// RCR
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<Row>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Scales,
AddFastGelu>>>& instances);
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<Row>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Scales,
Add>>>& instances);
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Scales,
FastGelu>>>& instances);
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Scales,
PassThrough>>>& instances);
// CRR
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<ck::Tuple<Col>,
ck::Tuple<Row, Row>,
ck::Tuple<Row>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Scales,
AddFastGelu>>>& instances);
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_bias_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<ck::Tuple<Col>,
ck::Tuple<Row, Row>,
ck::Tuple<Row>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Scales,
Add>>>& instances);
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_gelu_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<ck::Tuple<Col>,
ck::Tuple<Row, Row>,
ck::Tuple<>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Scales,
FastGelu>>>& instances);
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<ck::Tuple<Col>,
ck::Tuple<Row, Row>,
ck::Tuple<>,
Row,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Scales,
PassThrough>>>& instances);
// GEMM + Add + Gelu
template <typename AsLayout,
typename BsLayout,
typename DsLayout,
typename ELayout,
typename AsDataType,
typename BsDataType,
typename DsDataType,
typename EDataType>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceGroupedGemmMultiABDFixedNK<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
AddFastGelu>>
{
using DeviceOp = DeviceGroupedGemmMultiABDFixedNK<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
AddFastGelu>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
{
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_instances(
op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_instances(
op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_instances(
op_ptrs);
}
}
return op_ptrs;
}
};
// GEMM + Add
template <typename AsLayout,
typename BsLayout,
typename DsLayout,
typename ELayout,
typename AsDataType,
typename BsDataType,
typename DsDataType,
typename EDataType>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceGroupedGemmMultiABDFixedNK<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
Add>>
{
using DeviceOp = DeviceGroupedGemmMultiABDFixedNK<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
Add>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
{
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_instances(
op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_bias_instances(
op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_instances(
op_ptrs);
}
}
return op_ptrs;
}
};
// GEMM + Gelu
template <typename AsLayout,
typename BsLayout,
typename DsLayout,
typename ELayout,
typename AsDataType,
typename BsDataType,
typename DsDataType,
typename EDataType>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceGroupedGemmMultiABDFixedNK<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
FastGelu>>
{
using DeviceOp = DeviceGroupedGemmMultiABDFixedNK<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
FastGelu>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
is_same_v<DsDataType, ck::Tuple<>> && is_same_v<EDataType, BF16>)
{
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_instances(
op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_gelu_instances(
op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_instances(
op_ptrs);
}
}
return op_ptrs;
}
};
// GEMM
template <typename AsLayout,
typename BsLayout,
typename DsLayout,
typename ELayout,
typename AsDataType,
typename BsDataType,
typename DsDataType,
typename EDataType>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceGroupedGemmMultiABDFixedNK<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
PassThrough>>
{
using DeviceOp = DeviceGroupedGemmMultiABDFixedNK<AsLayout,
BsLayout,
DsLayout,
ELayout,
AsDataType,
BsDataType,
DsDataType,
EDataType,
PassThrough,
Scales,
PassThrough>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
is_same_v<DsDataType, ck::Tuple<>> && is_same_v<EDataType, BF16>)
{
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_instances(
op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_instances(
op_ptrs);
}
if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_instances(
op_ptrs);
}
}
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
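// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the instance library: enumerate the grouped
// fixed-NK bias+gelu instances registered above and print each instance name
// via BaseOperator::GetTypeString(). The layout/type combination below is one
// of the specializations declared in this header; handy for checking which
// tuning rows made it into a given build.
// ---------------------------------------------------------------------------
#include <iostream>

inline void list_grouped_gemm_bias_gelu_instances()
{
    using Row         = ck::tensor_layout::gemm::RowMajor;
    using BF16        = ck::bhalf_t;
    using I8          = int8_t;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using Scales      = ck::tensor_operation::element_wise::Scales;
    using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Row>, ck::Tuple<Row, Row>, ck::Tuple<Row>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
        PassThrough, Scales, AddFastGelu>;

    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << op_ptrs.size() << " grouped fixed-NK instances:\n";
    for(const auto& op : op_ptrs)
        std::cout << "  " << op->GetTypeString() << '\n';
}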
# ONLY XDL_KERNELS
set(GEMM_MULTI_ABD_INSTANCES)
list(APPEND GEMM_MULTI_ABD_INSTANCES
device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_mk_nk_mn_v1_instance.cpp
device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_km_kn_mn_v1_instance.cpp
)
add_instance_library(device_gemm_multi_abd_instance ${GEMM_MULTI_ABD_INSTANCES})
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
using B0DataType = I8;
using B1DataType = BF16;
using BsDataType = ck::Tuple<B0DataType, B1DataType>;
using AccDataType = F32;
using CShuffleDataType = BF16;
using D0DataType = BF16;
// using DsDataType = ck::Tuple<D0DataType>;
using EDataType = BF16;
using A0Layout = Col;
using AsLayout = ck::Tuple<A0Layout>;
using B0Layout = Row;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
using D0Layout = Row;
// using DsLayout = ck::Tuple<D0Layout>;
using ELayout = Row;
using Scales = ck::tensor_operation::element_wise::Scales;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
using Add = ck::tensor_operation::element_wise::Add;
using AElementOp = PassThrough;
using BElementOp = Scales;
// using CDEElementOp = AddFastGelu;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
template <typename DsLayout,
typename DsDataType,
typename CDEElementOp,
ck::tensor_operation::device::GemmSpecialization GemmSpec,
ck::PipelineVersion PipVer,
ck::LoopScheduler LoopSche>
using device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_instances = std::tuple<
// clang-format off
//###############################| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| K0Per| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//###############################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
//###############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
//###############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
//PipelineVersion::v1
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 48, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 24, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 64, 32, 32, 32, 8, 8, 32, 32, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 64, 16, 32, 32, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 4>, 4, LoopSche, PipVer>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
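// ---------------------------------------------------------------------------
// Illustrative check, not part of the library: the tuning rows above follow a
// fixed tile decomposition. Each workgroup of BlockSize lanes is MWave x NWave
// wavefronts (64 lanes each is assumed, as on gfx9 MFMA hardware), and
// MPerBlock = MWave * MXdlPerWave * MPerXdl (likewise for N). A constexpr
// helper makes that relation explicit when adding new rows.
// ---------------------------------------------------------------------------
constexpr bool tile_shape_ok(int BlockSize,
                             int MPerBlock, int NPerBlock,
                             int MPerXdl, int NPerXdl,
                             int MXdlPerWave, int NXdlPerWave)
{
    // Waves implied by the tile shape must exactly fill the workgroup.
    const int MWave = MPerBlock / (MPerXdl * MXdlPerWave);
    const int NWave = NPerBlock / (NPerXdl * NXdlPerWave);
    return MWave > 0 && NWave > 0 &&
           MWave * MPerXdl * MXdlPerWave == MPerBlock &&
           NWave * NPerXdl * NXdlPerWave == NPerBlock &&
           MWave * NWave * 64 == BlockSize;
}
// First and last rows of the table above: 256 lanes / 256x128 tile / 32x32 MFMA
// with 4x2 xdlops per wave, and 64 lanes / 16x32 tile / 16x16 MFMA with 1x2.
static_assert(tile_shape_ok(256, 256, 128, 32, 32, 4, 2), "tile/wave mismatch");
static_assert(tile_shape_ok(64, 16, 32, 16, 16, 1, 2), "tile/wave mismatch");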
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
using B0DataType = I8;
using B1DataType = BF16;
using BsDataType = ck::Tuple<B0DataType, B1DataType>;
using AccDataType = F32;
using CShuffleDataType = BF16;
using D0DataType = BF16;
// using DsDataType = ck::Tuple<D0DataType>;
using EDataType = BF16;
using A0Layout = Row;
using AsLayout = ck::Tuple<A0Layout>;
using B0Layout = Row;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
using D0Layout = Row;
// using DsLayout = ck::Tuple<D0Layout>;
using ELayout = Row;
using Scales = ck::tensor_operation::element_wise::Scales;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
using Add = ck::tensor_operation::element_wise::Add;
using AElementOp = PassThrough;
using BElementOp = Scales;
// using CDEElementOp = AddFastGelu;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
template <typename DsLayout,
typename DsDataType,
typename CDEElementOp,
ck::tensor_operation::device::GemmSpecialization GemmSpec,
ck::PipelineVersion PipVer,
ck::LoopScheduler LoopSche>
using device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_instances = std::tuple<
// clang-format off
//###############################| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| K0Per| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//###############################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
//###############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
//###############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
//PipelineVersion::v1
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 48, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 24, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 64, 32, 32, 32, 8, 8, 32, 32, 1, 1, S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 64, 16, 32, 32, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 4>, 4, LoopSche, PipVer>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
using B0DataType = I8;
using B1DataType = BF16;
using BsDataType = ck::Tuple<B0DataType, B1DataType>;
using AccDataType = F32;
using CShuffleDataType = BF16;
using D0DataType = BF16;
// using DsDataType = ck::Tuple<D0DataType>;
using EDataType = BF16;
using A0Layout = Row;
using AsLayout = ck::Tuple<A0Layout>;
using B0Layout = Col;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
using D0Layout = Row;
// using DsLayout = ck::Tuple<D0Layout>;
using ELayout = Row;
using Scales = ck::tensor_operation::element_wise::Scales;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
using Add = ck::tensor_operation::element_wise::Add;
using AElementOp = PassThrough;
using BElementOp = Scales;
// using CDEElementOp = AddFastGelu;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
template <typename DsLayout,
typename DsDataType,
typename CDEElementOp,
ck::tensor_operation::device::GemmSpecialization GemmSpec,
ck::PipelineVersion PipVer,
ck::LoopScheduler LoopSche>
using device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_instances = std::tuple<
// clang-format off
//###############################| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| K0Per| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//###############################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
//###############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
//###############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
//PipelineVersion::v1
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 48, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 24, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 64, 32, 32, 32, 8, 8, 32, 32, 1, 1, S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, LoopSche, PipVer>,
DeviceGemmMultipleABD_Xdl_CShuffle< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 64, 16, 32, 32, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 4, LoopSche, PipVer>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_common.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
BsLayout,
ck::Tuple<D0Layout>,
ELayout,
AsDataType,
BsDataType,
ck::Tuple<D0DataType>,
EDataType,
AElementOp,
BElementOp,
AddFastGelu>>>& instances)
{
add_device_operation_instances(
instances,
device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_instances<ck::Tuple<D0Layout>,
ck::Tuple<D0DataType>,
AddFastGelu,
GemmMNKPadding,
PipelineVersion::v1,
LoopScheduler::Default>{});
}
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
BsLayout,
ck::Tuple<D0Layout>,
ELayout,
AsDataType,
BsDataType,
ck::Tuple<D0DataType>,
EDataType,
AElementOp,
BElementOp,
Add>>>& instances)
{
add_device_operation_instances(
instances,
device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_instances<ck::Tuple<D0Layout>,
ck::Tuple<D0DataType>,
Add,
GemmMNKPadding,
PipelineVersion::v1,
LoopScheduler::Default>{});
}
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
BsLayout,
ck::Tuple<>,
ELayout,
AsDataType,
BsDataType,
ck::Tuple<>,
EDataType,
AElementOp,
BElementOp,
PassThrough>>>& instances)
{
add_device_operation_instances(
instances,
device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_instances<ck::Tuple<>,
ck::Tuple<>,
PassThrough,
GemmMNKPadding,
PipelineVersion::v1,
LoopScheduler::Default>{});
}
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_gelu_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
BsLayout,
ck::Tuple<>,
ELayout,
AsDataType,
BsDataType,
ck::Tuple<>,
EDataType,
AElementOp,
BElementOp,
FastGelu>>>& instances)
{
add_device_operation_instances(
instances,
device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_instances<ck::Tuple<>,
ck::Tuple<>,
FastGelu,
GemmMNKPadding,
PipelineVersion::v1,
LoopScheduler::Default>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
BsLayout,
ck::Tuple<D0Layout>,
ELayout,
AsDataType,
BsDataType,
ck::Tuple<D0DataType>,
EDataType,
AElementOp,
BElementOp,
AddFastGelu>>>& instances)
{
add_device_operation_instances(
instances,
device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_instances<ck::Tuple<D0Layout>,
ck::Tuple<D0DataType>,
AddFastGelu,
GemmMNKPadding,
PipelineVersion::v1,
LoopScheduler::Default>{});
}
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
BsLayout,
ck::Tuple<D0Layout>,
ELayout,
AsDataType,
BsDataType,
ck::Tuple<D0DataType>,
EDataType,
AElementOp,
BElementOp,
Add>>>& instances)
{
add_device_operation_instances(
instances,
device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_instances<ck::Tuple<D0Layout>,
ck::Tuple<D0DataType>,
Add,
GemmMNKPadding,
PipelineVersion::v1,
LoopScheduler::Default>{});
}
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
BsLayout,
ck::Tuple<>,
ELayout,
AsDataType,
BsDataType,
ck::Tuple<>,
EDataType,
AElementOp,
BElementOp,
PassThrough>>>& instances)
{
add_device_operation_instances(
instances,
device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_instances<ck::Tuple<>,
ck::Tuple<>,
PassThrough,
GemmMNKPadding,
PipelineVersion::v1,
LoopScheduler::Default>{});
}
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
BsLayout,
ck::Tuple<>,
ELayout,
AsDataType,
BsDataType,
ck::Tuple<>,
EDataType,
AElementOp,
BElementOp,
FastGelu>>>& instances)
{
add_device_operation_instances(
instances,
device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_instances<ck::Tuple<>,
ck::Tuple<>,
FastGelu,
GemmMNKPadding,
PipelineVersion::v1,
LoopScheduler::Default>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_common.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
BsLayout,
ck::Tuple<D0Layout>,
ELayout,
AsDataType,
BsDataType,
ck::Tuple<D0DataType>,
EDataType,
AElementOp,
BElementOp,
AddFastGelu>>>& instances)
{
add_device_operation_instances(
instances,
device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_instances<ck::Tuple<D0Layout>,
ck::Tuple<D0DataType>,
AddFastGelu,
GemmMNKPadding,
PipelineVersion::v1,
LoopScheduler::Default>{});
}
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
BsLayout,
ck::Tuple<D0Layout>,
ELayout,
AsDataType,
BsDataType,
ck::Tuple<D0DataType>,
EDataType,
AElementOp,
BElementOp,
Add>>>& instances)
{
add_device_operation_instances(
instances,
device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_instances<ck::Tuple<D0Layout>,
ck::Tuple<D0DataType>,
Add,
GemmMNKPadding,
PipelineVersion::v1,
LoopScheduler::Default>{});
}
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
BsLayout,
ck::Tuple<>,
ELayout,
AsDataType,
BsDataType,
ck::Tuple<>,
EDataType,
AElementOp,
BElementOp,
PassThrough>>>& instances)
{
add_device_operation_instances(
instances,
device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_instances<ck::Tuple<>,
ck::Tuple<>,
PassThrough,
GemmMNKPadding,
PipelineVersion::v1,
LoopScheduler::Default>{});
}
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_v1_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
BsLayout,
ck::Tuple<>,
ELayout,
AsDataType,
BsDataType,
ck::Tuple<>,
EDataType,
AElementOp,
BElementOp,
FastGelu>>>& instances)
{
add_device_operation_instances(
instances,
device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_instances<ck::Tuple<>,
ck::Tuple<>,
FastGelu,
GemmMNKPadding,
PipelineVersion::v1,
LoopScheduler::Default>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
# ONLY XDL_KERNELS
set(GROUPED_GEMM_FIXED_NK_MULTI_ABD_INSTANCES)
list(APPEND GROUPED_GEMM_FIXED_NK_MULTI_ABD_INSTANCES
device_grouped_gemm_xdl_fixed_nk_bias_gelu_bf16_i8_bf16_mk_kn_mn_instance.cpp
device_grouped_gemm_xdl_fixed_nk_bias_gelu_bf16_i8_bf16_mk_nk_mn_instance.cpp
device_grouped_gemm_xdl_fixed_nk_bias_gelu_bf16_i8_bf16_km_kn_mn_instance.cpp
)
add_instance_library(device_grouped_gemm_fixed_nk_multi_abd_instance ${GROUPED_GEMM_FIXED_NK_MULTI_ABD_INSTANCES})
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
using B0DataType = I8;
using B1DataType = BF16;
using BsDataType = ck::Tuple<B0DataType, B1DataType>;
using AccDataType = F32;
using CShuffleDataType = BF16;
using D0DataType = BF16;
// using DsDataType = ck::Tuple<D0DataType>;
using EDataType = BF16;
using A0Layout = Col;
using AsLayout = ck::Tuple<A0Layout>;
using B0Layout = Row;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
using D0Layout = Row;
// using DsLayout = ck::Tuple<Row>;
using ELayout = Row;
using Scales = ck::tensor_operation::element_wise::Scales;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using Add = ck::tensor_operation::element_wise::Add;
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
using AElementOp = PassThrough;
using BElementOp = Scales;
// using CDEElementOp = AddFastGelu;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <typename DsLayout,
typename DsDataType,
typename CDEElementOp,
ck::tensor_operation::device::GemmSpecialization GemmSpec>
using device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_instances = std::tuple<
// clang-format off
//######################################| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM|NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######################################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization|Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 32, 2>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 2>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>,
DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>
// clang-format on
>;
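// Usage sketch (not part of the upstream file): instance tuples like the one
// above are normally registered through add_device_operation_instances() from
// add_device_operation_instance.hpp. The function name and "DeviceOpPtrVec"
// placeholder below are assumptions; the concrete Ds/CDE choices mirror the
// commented-out aliases earlier in this file (one row-major bf16 D tensor and
// an AddFastGelu epilogue), with MNK padding for arbitrary problem sizes.
template <typename DeviceOpPtrVec> // e.g. std::vector<std::unique_ptr<BaseOp>>
void add_bf16_i8_bf16_km_kn_mn_fastgelu_instances_sketch(DeviceOpPtrVec& instances)
{
    add_device_operation_instances(
        instances,
        device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_instances<
            ck::Tuple<D0Layout>,
            ck::Tuple<D0DataType>,
            AddFastGelu,
            GemmMNKPadding>{});
}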
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck