Merge branch 'develop' of https://github.com/ROCm/composable_kernel into update_cka8w8

c8c016dd · aska-0096 · e8ca3daf · 4e731776 · c8c016dd · c8c016dd
Commit c8c016dd authored Dec 13, 2024 by aska-0096
20 changed files
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
@@ -4,6 +4,7 @@
 #pragma once

 #include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"

 namespace ck_tile {

@@ -52,6 +53,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;

+        // TODO: this 8 is AK1! should be a policy parameter!
        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
            make_tuple(number<kKPerBlock / 8>{}, number<kMPerBlock>{}, number<8>{}),
            make_tuple(number<(kMPerBlock + 1) * 8>{}, number<8>{}, number<1>{}),
@@ -264,6 +266,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
                static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
                static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
                constexpr index_t M0 = MPerBlock / (M2 * M1);
+                static_assert(M0 * M1 * M2 == MPerBlock,
+                              "Incorrect M0, M2, M1 configuration! "
+                              "M0, M1, M2 must cover whole MPerBlock!");

                return make_static_tile_distribution(
                    tile_distribution_encoding<sequence<1>,
@@ -277,6 +282,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
            {
                constexpr index_t M0 = BlockSize / get_warp_size();
                constexpr index_t M1 = MPerBlock / (M2 * M0);
+                static_assert(M0 * M1 * M2 == MPerBlock,
+                              "Incorrect M0, M1, M2 configuration! "
+                              "M0, M1, M2 must cover whole MPerBlock!");
                return make_static_tile_distribution(
                    tile_distribution_encoding<sequence<1>,
                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
@@ -350,6 +358,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
                static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error.");
                static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error.");
                constexpr index_t N0 = NPerBlock / (N2 * N1);
+                static_assert(N0 * N1 * N2 == NPerBlock,
+                              "Incorrect N0, N1, N2 configuration! "
+                              "N0, N1, N2 must cover whole NPerBlock!");

                return make_static_tile_distribution(
                    tile_distribution_encoding<sequence<1>,
@@ -364,7 +375,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
            {
                constexpr index_t N0 = BlockSize / get_warp_size();
                constexpr index_t N1 = NPerBlock / (N2 * N0);
-
+                static_assert(N0 * N1 * N2 == NPerBlock,
+                              "Incorrect N0, N1, N2 configuration! "
+                              "N0, N1, N2 must cover whole NPerBlock!");
                return make_static_tile_distribution(
                    tile_distribution_encoding<sequence<1>,
                                               tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
@@ -475,9 +488,28 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
    {
-        using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1DefaultPolicy;
-
-        return BlockGemmASmemBSmemCRegV1<Problem, BlockGemmPolicy>{};
+        constexpr bool TransposeC = false;
+        constexpr auto I0         = number<0>{};
+        constexpr auto I1         = number<1>{};
+        constexpr auto I2         = number<2>{};
+
+        using AccDataType     = float;
+        using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
+        using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
+        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
+                                                typename Problem::BDataType,
+                                                AccDataType,
+                                                WarpTile::at(I0),
+                                                WarpTile::at(I1),
+                                                WarpTile::at(I2),
+                                                TransposeC>;
+        using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
+                                                                      typename Problem::BDataType,
+                                                                      typename Problem::CDataType,
+                                                                      BlockWarps,
+                                                                      WarpGemm>;
+
+        return BlockUniversalGemmAsBsCr<Problem, BlockGemmPolicy>{};
    }
 };


--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
@@ -33,6 +33,8 @@ struct GemmPipelineProblemBase
    static constexpr bool kPadN = GemmTraits::kPadN;
    static constexpr bool kPadK = GemmTraits::kPadK;

+    static constexpr auto Scheduler = GemmPipelineScheduler::Default;
+
    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentA()
    {
        if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)

--- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
@@ -10,114 +10,134 @@
 namespace ck_tile {

 // fp16
-using WarpGemmMfmaF16F16F32M32N32K8 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K8>>;

-using WarpGemmMfmaF16F16F32M16N16K16 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K16>>;
+using WarpGemmMfmaF16F16F32M32N32K8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>>>;

-using WarpGemmMfmaF16F16F32M32N32K16 =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplF16F16F32M32N32K8, 2>>;
+using WarpGemmMfmaF16F16F32M16N16K16 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>>>;

-using WarpGemmMfmaF16F16F32M16N16K32 =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplF16F16F32M16N16K16, 2>>;
+using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
+    2>>;

-using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl<
-    WarpGemmAtrributeMfmaIterateK_SwizzleA<WarpGemmAttributeMfmaImplF16F16F32M32N32K8, 1>>;
+using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>,
+    2>>;

-using WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl<
-    WarpGemmAtrributeMfmaIterateK_SwizzleA<WarpGemmAttributeMfmaImplF16F16F32M32N32K8, 2>>;
+using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
+    WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
+    1>>;

-using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplF16F16F32M32N32K8>>;
+using WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
+    WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
+    2>>;

-using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplF16F16F32M16N16K16>>;
+using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>>>;

 using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
-        WarpGemmAttributeMfmaImplF16F16F32M32N32K8,
+        WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
        2>>;

 using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
-        WarpGemmAttributeMfmaImplF16F16F32M16N16K16,
+        WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>,
        2>>;

 using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
-        WarpGemmAttributeMfmaImplF16F16F32M32N32K8,
+        WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
        2>>;

 // bf16
-using WarpGemmMfmaBf16Bf16F32M32N32K8 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8>>;

-using WarpGemmMfmaBf16Bf16F32M16N16K16 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16>>;
+using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfmaBf16Bf16F32M16N16K16 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;

-using WarpGemmMfmaBf16Bf16F32M32N32K16 =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8, 2>>;
+using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
+    2>>;

-using WarpGemmMfmaBf16Bf16F32M16N16K32 =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16, 2>>;
+using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>,
+    2>>;

-using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl<
-    WarpGemmAtrributeMfmaIterateK_SwizzleA<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8, 1>>;
+using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
+    WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
+    1>>;

-using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA = WarpGemmImpl<
-    WarpGemmAtrributeMfmaIterateK_SwizzleA<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8, 2>>;
+using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA =
+    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
+        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
+        2>>;

-using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8>>;
+using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;

-using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16>>;
+using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;

 using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
-        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8,
+        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
        2>>;

 using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
-        WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16,
+        WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>,
        2>>;

 using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
-        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8,
+        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
        2>>;

 // fp8
-using WarpGemmMfma_f32_32x32x16_fp8_fp8 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8>>;

-using WarpGemmMfma_f32_32x32x16_fp8_bf8 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8>>;
+using WarpGemmMfma_f32_32x32x16_fp8_fp8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfma_f32_32x32x16_fp8_bf8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8<WGAttrCtlEnum::Default_>>>;

-using WarpGemmMfma_f32_32x32x16_bf8_fp8 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8>>;
+using WarpGemmMfma_f32_32x32x16_bf8_fp8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8<WGAttrCtlEnum::Default_>>>;

-using WarpGemmMfma_f32_32x32x16_bf8_bf8 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8>>;
+using WarpGemmMfma_f32_32x32x16_bf8_bf8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>>>;

-using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8>>;
+using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>>>;

-using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8>>;
+using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8<WGAttrCtlEnum::Default_>>>;

-using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8>>;
+using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8<WGAttrCtlEnum::Default_>>>;

-using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8>>;
+using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>>>;

 template <index_t swizzle_factor = 2>
 using WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
-        WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t>,
+        WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t, WGAttrCtlEnum::Default_>,
        2,
        swizzle_factor>>;


--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -21,9 +21,12 @@ struct WarpGemmAtrributeMfma
    using BVecType = typename Impl::BVecType;
    using CVecType = typename Impl::CVecType;

-    static constexpr index_t kM = Impl::kM;
-    static constexpr index_t kN = Impl::kN;
-    static constexpr index_t kK = Impl::kK;
+    static constexpr index_t kM          = Impl::kM;
+    static constexpr index_t kN          = Impl::kN;
+    static constexpr index_t kK          = Impl::kK;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane;
+
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
@@ -51,10 +54,13 @@ struct WarpGemmAtrributeMfma
        sequence<0, 2>>;

    // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
    {
-        Impl{}(c_vec, a_vec, b_vec);
+        Impl{}(c_vec, a_vec, b_vec, bool_constant<post_nop_>{});
    }

    // c_vec = a_vec * b_vec
@@ -81,9 +87,12 @@ struct WarpGemmAtrributeMfmaIterateK
        ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
    using CVecType = typename Impl::CVecType;

-    static constexpr index_t kM = Impl::kM;
-    static constexpr index_t kN = Impl::kN;
-    static constexpr index_t kK = Impl::kK * kKIter;
+    static constexpr index_t kM          = Impl::kM;
+    static constexpr index_t kN          = Impl::kN;
+    static constexpr index_t kK          = Impl::kK * kKIter;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter;
+
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
@@ -111,8 +120,11 @@ struct WarpGemmAtrributeMfmaIterateK
        sequence<0, 2>>;

    // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
    {
        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
@@ -122,10 +134,33 @@ struct WarpGemmAtrributeMfmaIterateK
                   reinterpret_cast<const buf_a&>(a_vec)
                       .template get_as<typename Impl::AVecType>()[iKIter],
                   reinterpret_cast<const buf_b&>(b_vec)
-                       .template get_as<typename Impl::BVecType>()[iKIter]);
+                       .template get_as<typename Impl::BVecType>()[iKIter],
+                   bool_constant<post_nop_>{});
        });
    }

+    template <index_t iKIter, bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   number<iKIter>,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
+        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
+
+        static_assert(iKIter < kKIter);
+
+        // static_for<0, kKIter, 1>{}([&](auto iKIter) {
+        Impl{}(c_vec,
+               reinterpret_cast<const buf_a&>(a_vec)
+                   .template get_as<typename Impl::AVecType>()[iKIter],
+               reinterpret_cast<const buf_b&>(b_vec)
+                   .template get_as<typename Impl::BVecType>()[iKIter],
+               bool_constant<post_nop_>{});
+        //});
+    }
+
    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
@@ -164,9 +199,12 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution
    using BVecType = typename Impl::AVecType;
    using CVecType = typename Impl::CVecType;

-    static constexpr index_t kM = Impl::kN;
-    static constexpr index_t kN = Impl::kM;
-    static constexpr index_t kK = Impl::kK;
+    static constexpr index_t kM          = Impl::kN;
+    static constexpr index_t kN          = Impl::kM;
+    static constexpr index_t kK          = Impl::kK;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane;
+
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
@@ -194,11 +232,14 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution
        sequence<0, 2>>;

    // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
    {
        // swap A and B
-        Impl{}(c_vec, b_vec, a_vec);
+        Impl{}(c_vec, b_vec, a_vec, bool_constant<post_nop_>{});
    }

    // c_vec = a_vec * b_vec
@@ -222,9 +263,12 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
    using BVecType = typename Impl::AVecType;
    using CVecType = typename Impl::CVecType;

-    static constexpr index_t kM = Impl::kN;
-    static constexpr index_t kN = Impl::kM;
-    static constexpr index_t kK = Impl::kK;
+    static constexpr index_t kM          = Impl::kN;
+    static constexpr index_t kN          = Impl::kM;
+    static constexpr index_t kK          = Impl::kK;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane;
+
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
@@ -255,12 +299,15 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
        sequence<2, 2>,
        sequence<0, 2>>;

+    template <bool post_nop_ = false>
    // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
    {
        // swap A and B
-        Impl{}(c_vec, b_vec, a_vec);
+        Impl{}(c_vec, b_vec, a_vec, bool_constant<post_nop_>{});
    }

    // c_vec = a_vec * b_vec
@@ -287,9 +334,12 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
        ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
    using CVecType = typename Impl::CVecType;

-    static constexpr index_t kM = Impl::kN;
-    static constexpr index_t kN = Impl::kM;
-    static constexpr index_t kK = Impl::kK * kKIter;
+    static constexpr index_t kM          = Impl::kN;
+    static constexpr index_t kN          = Impl::kM;
+    static constexpr index_t kK          = Impl::kK * kKIter;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter;
+
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
@@ -316,9 +366,12 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
        sequence<2, 2>,
        sequence<0, 2>>;

+    template <bool post_nop_ = false>
    // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
    {
        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
@@ -328,10 +381,34 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
                   reinterpret_cast<const buf_b&>(b_vec)
                       .template get_as<typename Impl::BVecType>()[iKIter],
                   reinterpret_cast<const buf_a&>(a_vec)
-                       .template get_as<typename Impl::AVecType>()[iKIter]);
+                       .template get_as<typename Impl::AVecType>()[iKIter],
+                   bool_constant<post_nop_>{});
        });
    }

+    template <index_t iKIter, bool post_nop_ = false>
+    // c_vec += a_vec * b_vec
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   number<iKIter>,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
+        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
+
+        static_assert(iKIter < kKIter);
+        // swap A and B, value and type
+        // static_for<0, kKIter, 1>{}([&](auto iKIter) {
+        Impl{}(c_vec,
+               reinterpret_cast<const buf_b&>(b_vec)
+                   .template get_as<typename Impl::BVecType>()[iKIter],
+               reinterpret_cast<const buf_a&>(a_vec)
+                   .template get_as<typename Impl::AVecType>()[iKIter],
+               bool_constant<post_nop_>{});
+        //});
+    }
+
    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
@@ -372,10 +449,13 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
        ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
    using CVecType = typename Impl::CVecType;

-    static constexpr index_t kM      = Impl::kN;
-    static constexpr index_t kN      = Impl::kM;
-    static constexpr index_t kK      = Impl::kK * kKIter;
-    static constexpr index_t SFactor = SFactor_; // group how many CM1 together
+    static constexpr index_t kM          = Impl::kN;
+    static constexpr index_t kN          = Impl::kM;
+    static constexpr index_t kK          = Impl::kK * kKIter;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter;
+    static constexpr index_t SFactor     = SFactor_; // group how many CM1 together
+
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
@@ -429,8 +509,11 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
        sequence<0, 2>>;
 #endif
    // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
    {
        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
@@ -440,10 +523,33 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
                   reinterpret_cast<const buf_b&>(b_vec)
                       .template get_as<typename Impl::BVecType>()[iKIter],
                   reinterpret_cast<const buf_a&>(a_vec)
-                       .template get_as<typename Impl::AVecType>()[iKIter]);
+                       .template get_as<typename Impl::AVecType>()[iKIter],
+                   bool_constant<post_nop_>{});
        });
    }

+    template <index_t iKIter, bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   number<iKIter>,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
+        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
+
+        static_assert(iKIter < kKIter);
+        // swap A and B, value and type
+        // static_for<0, kKIter, 1>{}([&](auto iKIter) {
+        Impl{}(c_vec,
+               reinterpret_cast<const buf_b&>(b_vec)
+                   .template get_as<typename Impl::BVecType>()[iKIter],
+               reinterpret_cast<const buf_a&>(a_vec)
+                   .template get_as<typename Impl::AVecType>()[iKIter],
+               bool_constant<post_nop_>{});
+        //});
+    }
+
    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
@@ -483,10 +589,13 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA
        ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
    using CVecType = typename Impl::CVecType;

-    static constexpr index_t kM      = Impl::kM;
-    static constexpr index_t kN      = Impl::kN;
-    static constexpr index_t kK      = Impl::kK * kKIter;
-    static constexpr index_t SFactor = SFactor_; // group how many CM1 together
+    static constexpr index_t kM          = Impl::kM;
+    static constexpr index_t kN          = Impl::kN;
+    static constexpr index_t kK          = Impl::kK * kKIter;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter;
+    static constexpr index_t SFactor     = SFactor_; // group how many CM1 together
+
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
@@ -518,8 +627,11 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA
        sequence<0, 2>>;

    // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
    {
        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
@@ -529,10 +641,33 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA
                   reinterpret_cast<const buf_a&>(a_vec)
                       .template get_as<typename Impl::AVecType>()[iKIter],
                   reinterpret_cast<const buf_b&>(b_vec)
-                       .template get_as<typename Impl::BVecType>()[iKIter]);
+                       .template get_as<typename Impl::BVecType>()[iKIter],
+                   bool_constant<post_nop_>{});
        });
    }

+    template <index_t iKIter, bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   number<iKIter>,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
+        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
+
+        static_assert(iKIter < kKIter);
+
+        // static_for<0, kKIter, 1>{}([&](auto iKIter) {
+        Impl{}(c_vec,
+               reinterpret_cast<const buf_a&>(a_vec)
+                   .template get_as<typename Impl::AVecType>()[iKIter],
+               reinterpret_cast<const buf_b&>(b_vec)
+                   .template get_as<typename Impl::BVecType>()[iKIter],
+               bool_constant<post_nop_>{});
+        //});
+    }
+
    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {

--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -7,12 +7,68 @@

 namespace ck_tile {

+// TODO: refactor warp-gemm
+// currently there is a discrepency for vav/vva if we need transpose C/D
+// e.g. if we want A:agpr, B:vgpr, we have to use vva in WGAttrEnum
+// because we swap the A/B pointer in _impl code (but not known this info here)
+enum class WGAttrCtlEnum
+{
+    Default_ = 0,
+    Raw_vvv  = 1, // c-vgpr, a-vgpr, b-vgpr
+    Raw_vaa  = 2, // c-vgpr, a-agpr, b-agpr
+    Raw_vav  = 3, // c-vgpr, a-agpr, b-vgpr
+    Raw_vva  = 4, // c-vgpr, a-vgpr, b-agpr
+    Raw_avv  = 5, // c-agpr, a-vgpr, b-vgpr
+    // raw_a_a_a = 3,  // c-agpr, a-agpr, b-agpr
+};
+
+#define DISPATCH_MFMA_(mfma_, dmod_, amod_, bmod_, cmod_)       \
+    if constexpr(post_nop_)                                     \
+    {                                                           \
+        asm volatile(mfma_ " %0, %1, %2, %3 ; yyy\n"            \
+                           "s_nop 3"                            \
+                     : dmod_(c_vec)                             \
+                     : amod_(a_vec), bmod_(b_vec), cmod_(c_vec) \
+                     :);                                        \
+    }                                                           \
+    else                                                        \
+    {                                                           \
+        asm volatile(mfma_ " %0, %1, %2, %3\n"                  \
+                     : dmod_(c_vec)                             \
+                     : amod_(a_vec), bmod_(b_vec), cmod_(c_vec) \
+                     :);                                        \
+    }
+
+#define DISPATCH_MFMA_CTRL_(mfma_, ctrl_)              \
+    if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vvv)      \
+    {                                                  \
+        DISPATCH_MFMA_(mfma_, "+v", "v", "v", "v")     \
+    }                                                  \
+    else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vaa) \
+    {                                                  \
+        DISPATCH_MFMA_(mfma_, "+v", "a", "a", "v")     \
+    }                                                  \
+    else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vav) \
+    {                                                  \
+        DISPATCH_MFMA_(mfma_, "+v", "a", "v", "v")     \
+    }                                                  \
+    else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vva) \
+    {                                                  \
+        DISPATCH_MFMA_(mfma_, "+v", "v", "a", "v")     \
+    }                                                  \
+    else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_avv) \
+    {                                                  \
+        DISPATCH_MFMA_(mfma_, "+a", "v", "v", "a")     \
+    }
+
 // FP16
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
 {
-    using ADataType = fp16_t;
-    using BDataType = fp16_t;
-    using CDataType = float;
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = fp16_t;
+    using BDataType                     = fp16_t;
+    using CDataType                     = float;

    using AVecType = ext_vector_t<fp16_t, 4>;
    using BVecType = ext_vector_t<fp16_t, 4>;
@@ -33,16 +89,23 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
    static constexpr index_t kCM1PerLane = 4;

    // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x8f16", Ctrl)
+        else
+        {
 #if defined(__gfx9__)
-        c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0);
+            c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0);
 #else
-        ignore = c_vec;
-        ignore = a_vec;
-        ignore = b_vec;
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
 #endif
+        }
    }

    // c_vec = a_vec * b_vec
@@ -52,18 +115,20 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
        return bit_cast<CVecType>(
            __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0));
 #else
-        ignore = a_vec;
-        ignore = b_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
        return CVecType{0.f};
 #endif
    }
 };

+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
 {
-    using ADataType = fp16_t;
-    using BDataType = fp16_t;
-    using CDataType = float;
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = fp16_t;
+    using BDataType                     = fp16_t;
+    using CDataType                     = float;

    using AVecType = ext_vector_t<fp16_t, 4>;
    using BVecType = ext_vector_t<fp16_t, 4>;
@@ -84,16 +149,23 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
    static constexpr index_t kCM1PerLane = 4;

    // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x16f16", Ctrl)
+        else
+        {
 #if defined(__gfx9__)
-        c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0);
+            c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0);
 #else
-        ignore = c_vec;
-        ignore = a_vec;
-        ignore = b_vec;
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
 #endif
+        }
    }

    // c_vec = a_vec * b_vec
@@ -103,19 +175,21 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
        return bit_cast<CVecType>(
            __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
 #else
-        ignore = a_vec;
-        ignore = b_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
        return CVecType{0.f};
 #endif
    }
 };

 // Bf16
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
 {
-    using ADataType = bf16_t;
-    using BDataType = bf16_t;
-    using CDataType = float;
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = bf16_t;
+    using BDataType                     = bf16_t;
+    using CDataType                     = float;

    using AVecType = ext_vector_t<bf16_t, 4>;
    using BVecType = ext_vector_t<bf16_t, 4>;
@@ -136,28 +210,35 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
    static constexpr index_t kCM1PerLane = 4;

    // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x8bf16_1k", Ctrl)
+        else
+        {
 #if defined(__gfx90a__) || defined(__gfx94__)
-        c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
+            c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
 #elif defined(__gfx908__)
-        static_for<0, 2, 1>{}([&](auto k) {
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16(
-                reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
-                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
-                reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
-                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
-                c_vec,
-                0,
-                0,
-                0);
-        });
+            static_for<0, 2, 1>{}([&](auto k) {
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16(
+                    reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    c_vec,
+                    0,
+                    0,
+                    0);
+            });
 #else
-        ignore = c_vec;
-        ignore = a_vec;
-        ignore = b_vec;
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
 #endif
+        }
    }

    // c_vec = a_vec * b_vec
@@ -181,18 +262,20 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
        });
        return c_vec;
 #else
-        ignore = a_vec;
-        ignore = b_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
        return CVecType{0.f};
 #endif
    }
 };

+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
 {
-    using ADataType = bf16_t;
-    using BDataType = bf16_t;
-    using CDataType = float;
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = bf16_t;
+    using BDataType                     = bf16_t;
+    using CDataType                     = float;

    using AVecType = ext_vector_t<bf16_t, 4>;
    using BVecType = ext_vector_t<bf16_t, 4>;
@@ -213,28 +296,34 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
    static constexpr index_t kCM1PerLane = 4;

    // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x16bf16_1k", Ctrl)
+        {
 #if defined(__gfx90a__) || defined(__gfx94__)
-        c_vec = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
+            c_vec = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
 #elif defined(__gfx908__)
-        static_for<0, 2, 1>{}([&](auto k) {
-            c_vec = __builtin_amdgcn_mfma_f32_16x16x8bf16(
-                reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
-                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
-                reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
-                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
-                c_vec,
-                0,
-                0,
-                0);
-        });
+            static_for<0, 2, 1>{}([&](auto k) {
+                c_vec = __builtin_amdgcn_mfma_f32_16x16x8bf16(
+                    reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    c_vec,
+                    0,
+                    0,
+                    0);
+            });
 #else
-        ignore = c_vec;
-        ignore = a_vec;
-        ignore = b_vec;
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
 #endif
+        }
    }

    // c_vec = a_vec * b_vec
@@ -258,20 +347,21 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
        });
        return c_vec;
 #else
-        ignore = a_vec;
-        ignore = b_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
        return CVecType{0.f};
 #endif
    }
 };

 // FP8
-template <typename AType_, typename BType_>
+template <typename AType_, typename BType_, WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
 {
-    using ADataType = AType_;
-    using BDataType = BType_;
-    using CDataType = float;
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = AType_;
+    using BDataType                     = BType_;
+    using CDataType                     = float;

    using AVecType = ext_vector_t<ADataType, 8>;
    using BVecType = ext_vector_t<BDataType, 8>;
@@ -292,38 +382,120 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
    static constexpr index_t kCM1PerLane = 4;

    // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
    {
+        if constexpr(Ctrl == WGAttrCtlEnum::Raw_vvv)
+        {
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "v", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "v", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "v", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "v", "v", "v")
+            }
+        }
+        else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vaa)
+        {
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "a", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "a", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "a", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "a", "a", "v")
+            }
+        }
+        else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vav)
+        {
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "a", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "a", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "a", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "a", "v", "v")
+            }
+        }
+        else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vva)
+        {
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "v", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "v", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "v", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "v", "a", "v")
+            }
+        }
+        else
+        {
 #if defined(__gfx94__)
-        if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
-        else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
-        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
-        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
+                    bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
+                    bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
+                    bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
+                    bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
 #elif defined(__gfx908__) || defined(__gfx90a__)
-        static_for<0, 8, 1>{}([&](auto k) {
-            float a_f32 =
-                type_convert<float>(reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
-                                        .template get_as<ADataType>()[number<k>{}]);
-            float b_f32 =
-                type_convert<float>(reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
-                                        .template get_as<BDataType>()[number<k>{}]);
-
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
-        });
+            static_for<0, 8, 1>{}([&](auto k) {
+                float a_f32 =
+                    type_convert<float>(reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
+                                            .template get_as<ADataType>()[number<k>{}]);
+                float b_f32 =
+                    type_convert<float>(reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
+                                            .template get_as<BDataType>()[number<k>{}]);
+
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
+            });
 #else
-        ignore = c_vec;
-        ignore = a_vec;
-        ignore = b_vec;
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
 #endif
+        }
    }

    // c_vec = a_vec * b_vec
@@ -356,20 +528,97 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
        });
        return c_vec;
 #else
-        ignore = a_vec;
-        ignore = b_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
        return CVecType{0.f};
 #endif
    }
 };

+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8 =
-    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t>;
+    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t, Ctrl_>;
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8 =
-    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, bf8_t>;
+    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, bf8_t, Ctrl_>;
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8 =
-    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, fp8_t>;
+    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, fp8_t, Ctrl_>;
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8 =
-    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, bf8_t>;
+    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, bf8_t, Ctrl_>;
+
+// int8
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = int8_t;
+    using BDataType                     = int8_t;
+    using CDataType                     = int32_t;
+
+    using AVecType = ext_vector_t<ADataType, 8>;
+    using BVecType = ext_vector_t<BDataType, 8>;
+    using CVecType = ext_vector_t<CDataType, 16>;
+
+    static constexpr index_t kM = 32;
+    static constexpr index_t kN = 32;
+    static constexpr index_t kK = 16;
+
+    static constexpr index_t kAMLane     = 32;
+    static constexpr index_t kBNLane     = 32;
+    static constexpr index_t kABKLane    = 2;
+    static constexpr index_t kABKPerLane = 8;
+
+    static constexpr index_t kCMLane     = 2;
+    static constexpr index_t kCNLane     = 32;
+    static constexpr index_t kCM0PerLane = 4;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_i32_32x32x16_i8", Ctrl)
+        else
+        {
+#if defined(__gfx94__)
+            c_vec = __builtin_amdgcn_mfma_i32_32x32x8i8(
+                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+#elif defined(__gfx908__) || defined(__gfx90a__)
+            static_for<0, 8, 1>{}([&](auto k) {
+                float a_f32 =
+                    type_convert<float>(reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
+                                            .template get_as<ADataType>()[number<k>{}]);
+                float b_f32 =
+                    type_convert<float>(reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
+                                            .template get_as<BDataType>()[number<k>{}]);
+
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
+            });
+#else
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+        CVecType c_vec{0};
+        operator()(c_vec, a_vec, b_vec);
+        return c_vec;
+    }
+};
+
+#undef DISPATCH_MFMA_

 } // namespace ck_tile
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -21,40 +21,40 @@ struct WarpGemmMfmaDispatcher;

 // clang-format off
 // fp16
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaF16F16F32M32N32K8; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaF16F16F32M32N32K16; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaF16F16F32M16N16K16; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaF16F16F32M16N16K32; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaF16F16F32M32N32K8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaF16F16F32M32N32K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaF16F16F32M16N16K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaF16F16F32M16N16K32; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; };

-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; };

 // bf16
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; };

-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; };

 // fp8
-template<> struct WarpGemmMfmaDispatcher<fp8_t, fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
-template<> struct WarpGemmMfmaDispatcher<fp8_t, fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<fp8_t, bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; };
-template<> struct WarpGemmMfmaDispatcher<fp8_t, bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<bf8_t, fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; };
-template<> struct WarpGemmMfmaDispatcher<bf8_t, fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<bf8_t, bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; };
-template<> struct WarpGemmMfmaDispatcher<bf8_t, bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; };

 // clang-format on
 } // namespace impl

--- a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -14,6 +14,11 @@ struct WarpGemmImpl
    static constexpr index_t kM = WarpGemmAttribute::kM;
    static constexpr index_t kN = WarpGemmAttribute::kN;
    static constexpr index_t kK = WarpGemmAttribute::kK;
+    /// @brief The number of elements in K dimension processed by single thread in wavefront.
+    ///
+    /// @note  Note that WarpGemm may run MFMA instruction multiple times (on different K).
+    ///        In such situation this value reflects this fact.
+    static constexpr index_t kKPerThread = WarpGemmAttribute::kKPerThread;

    using ADataType = typename WarpGemmAttribute::ADataType;
    using BDataType = typename WarpGemmAttribute::BDataType;
@@ -31,11 +36,21 @@ struct WarpGemmImpl
    using BWarpTensor = static_distributed_tensor<BDataType, BWarpDstr>;
    using CWarpTensor = static_distributed_tensor<CDataType, CWarpDstr>;

-    CK_TILE_DEVICE void operator()(CWarpTensor& c, const AWarpTensor& a, const BWarpTensor& b) const
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access()
    {
-        using AVec = ext_vector_t<ADataType, AWarpTensor::get_thread_buffer_size()>;
-        using BVec = ext_vector_t<BDataType, BWarpTensor::get_thread_buffer_size()>;
-        using CVec = ext_vector_t<CDataType, CWarpTensor::get_thread_buffer_size()>;
+        return WarpGemmAttribute_::get_num_of_access();
+    }
+
+    template <typename CTensor, typename ATensor, typename BTensor, bool post_nop_ = false>
+    CK_TILE_DEVICE void
+    operator()(CTensor& c, const ATensor& a, const BTensor& b, bool_constant<post_nop_> = {}) const
+    {
+        static_assert(detail::is_similiar_distributed_tensor_v<CTensor, CWarpTensor> &&
+                      detail::is_similiar_distributed_tensor_v<ATensor, AWarpTensor> &&
+                      detail::is_similiar_distributed_tensor_v<BTensor, BWarpTensor>);
+        using AVec = ext_vector_t<ADataType, ATensor::get_thread_buffer_size()>;
+        using BVec = ext_vector_t<BDataType, BTensor::get_thread_buffer_size()>;
+        using CVec = ext_vector_t<CDataType, CTensor::get_thread_buffer_size()>;

        constexpr auto I0 = number<0>{};

@@ -44,18 +59,49 @@ struct WarpGemmImpl
        auto c_vec       = c.get_thread_buffer().template get_as<CVec>()[I0];

        // c_vec += a_vec * b_vec
-        WarpGemmAttribute{}(c_vec, a_vec, b_vec);
+        WarpGemmAttribute{}(c_vec, a_vec, b_vec, bool_constant<post_nop_>{});

        c.get_thread_buffer().template set_as<CVec>(I0, c_vec);
    }

-    CK_TILE_DEVICE auto operator()(const AWarpTensor& a, const BWarpTensor& b) const
+    template <typename CTensor,
+              typename ATensor,
+              typename BTensor,
+              index_t i_subk,
+              bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CTensor& c,
+                                   const ATensor& a,
+                                   const BTensor& b,
+                                   number<i_subk>,
+                                   bool_constant<post_nop_> = {}) const
    {
-        CWarpTensor c;
+        using AVec = ext_vector_t<ADataType, ATensor::get_thread_buffer_size()>;
+        using BVec = ext_vector_t<BDataType, BTensor::get_thread_buffer_size()>;
+        using CVec = ext_vector_t<CDataType, CTensor::get_thread_buffer_size()>;
+
+        constexpr auto I0 = number<0>{};

-        using AVec = ext_vector_t<ADataType, AWarpTensor::get_thread_buffer_size()>;
-        using BVec = ext_vector_t<BDataType, BWarpTensor::get_thread_buffer_size()>;
-        using CVec = ext_vector_t<CDataType, CWarpTensor::get_thread_buffer_size()>;
+        const auto a_vec = a.get_thread_buffer().template get_as<AVec>()[I0];
+        const auto b_vec = b.get_thread_buffer().template get_as<BVec>()[I0];
+        auto c_vec       = c.get_thread_buffer().template get_as<CVec>()[I0];
+
+        // c_vec += a_vec * b_vec
+        WarpGemmAttribute{}(c_vec, a_vec, b_vec, number<i_subk>{}, bool_constant<post_nop_>{});
+
+        c.get_thread_buffer().template set_as<CVec>(I0, c_vec);
+    }
+
+    template <typename ATensor, typename BTensor>
+    CK_TILE_DEVICE auto operator()(const ATensor& a, const BTensor& b) const
+    {
+        using CTensor = CWarpTensor;
+        static_assert(detail::is_similiar_distributed_tensor_v<ATensor, AWarpTensor> &&
+                      detail::is_similiar_distributed_tensor_v<BTensor, BWarpTensor>);
+        CTensor c;
+
+        using AVec = ext_vector_t<ADataType, ATensor::get_thread_buffer_size()>;
+        using BVec = ext_vector_t<BDataType, BTensor::get_thread_buffer_size()>;
+        using CVec = ext_vector_t<CDataType, CTensor::get_thread_buffer_size()>;

        constexpr auto I0 = number<0>{};


--- a/include/ck_tile/ops/smoothquant.hpp
+++ b/include/ck_tile/ops/smoothquant.hpp
@@ -3,6 +3,7 @@

 #pragma once

+#include "ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp"
 #include "ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp"
 #include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp"
 #include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp"

--- a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
+++ b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+
+namespace ck_tile {
+
+// host side args
+struct MoeSmoothquantHostArgs
+{
+    const void* p_x;        // [tokens ,hidden_size], input, fp16/bf16
+    const void* p_xscale;   // [experts, hidden_size], input, columnwise scale, fp32
+    const void* p_topk_ids; // [tokens, topk]
+
+    void* p_yscale; // [topk * tokens,  1], output, rowwise quant scale
+    void* p_qy;     // [topk * tokens, hidden_size], output
+
+    index_t tokens;
+    index_t hidden_size;
+    index_t experts;
+    index_t topk;
+    index_t x_stride; // input x row stride
+    index_t y_stride; // output y stride(stride for topk)
+};
+
+// TODO: Extract some type to wrapper class
+template <typename Pipeline_>
+struct MoeSmoothquant
+{
+    using Pipeline = remove_cvref_t<Pipeline_>;
+    using Problem  = typename Pipeline::Problem;
+
+    using XDataType       = remove_cvref_t<typename Problem::XDataType>;
+    using XScaleDataType  = remove_cvref_t<typename Problem::XScaleDataType>;
+    using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
+    using YScaleDataType  = remove_cvref_t<typename Problem::YScaleDataType>;
+    using QYDataType      = remove_cvref_t<typename Problem::QYDataType>;
+
+    static constexpr index_t Block_M = Problem::BlockShape::Block_M;
+    static constexpr index_t Block_N = Problem::BlockShape::Block_N;
+    static constexpr bool kPadM      = false; // always no need to pad along M
+    static constexpr bool kPadN      = Problem::kPadN;
+    static constexpr bool kTwoPass   = Problem::kTwoPass;
+
+    static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
+    static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
+    static constexpr index_t Repeat_N        = Problem::BlockShape::Repeat_N;
+
+    static constexpr auto I0 = number<0>{};
+    static constexpr auto I1 = number<1>{};
+
+    static_assert(Problem::BlockShape::Repeat_M == 1);
+
+    struct Kargs
+    {
+        const void* p_x;        // [tokens ,hidden_size], input, fp16/bf16
+        const void* p_xscale;   // [experts, hidden_size], input, columnwise scale, fp32
+        const void* p_topk_ids; // [tokens, topk]
+
+        void* p_yscale; // [topk, tokens, 1], output, rowwise quant scale
+        void* p_qy;     // [topk, tokens, hidden_size], output
+
+        index_t tokens;
+        index_t hidden_size;
+        index_t experts;
+        index_t topk;
+        index_t x_stride; // input x row stride
+        index_t y_stride; // output y stride(stride for topk)
+    };
+    using Hargs = MoeSmoothquantHostArgs;
+
+    CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
+    {
+        return Kargs{hargs.p_x,
+                     hargs.p_xscale,
+                     hargs.p_topk_ids,
+                     hargs.p_yscale,
+                     hargs.p_qy,
+                     hargs.tokens,
+                     hargs.hidden_size,
+                     hargs.experts,
+                     hargs.topk,
+                     hargs.x_stride,
+                     hargs.y_stride};
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
+    {
+        return dim3(hargs.topk, integer_divide_ceil(hargs.tokens, Block_M), 1);
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
+
+    // clang-format off
+    template <typename T> struct t2s;
+    template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
+    template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
+    template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
+    template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
+    template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
+    // clang-format on
+
+    // in byte
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); }
+
+    CK_TILE_HOST static std::string GetName()
+    {
+        // clang-format off
+        using S_ = typename Problem::BlockShape;
+        auto surfix = [&] () {
+            std::string n;
+            if (kPadN) n += "_pn";
+            if (kTwoPass) n += "_2p";
+            return n; }();
+
+        #define _SS_  std::string
+        #define _TS_  std::to_string
+        return _SS_("moe_smoothquant_") + _SS_(t2s<XDataType>::name) + "_" +
+             _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" +
+             _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" +
+             _SS_(Pipeline::name) + surfix;
+        #undef _SS_
+        #undef _TS_
+        // clang-format on
+    }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        const index_t i_topk  = blockIdx.x;
+        const index_t i_token = blockIdx.y * Block_M;
+        const index_t i_token_in_thrd =
+            __builtin_amdgcn_readfirstlane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);
+
+        const index_t i_expert = reinterpret_cast<const index_t*>(
+            kargs.p_topk_ids)[(i_token + i_token_in_thrd) * kargs.topk + i_topk];
+
+        // [tokens ,hidden_size]
+        const auto x_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<const XDataType*>(kargs.p_x),
+                make_tuple(kargs.tokens, kargs.hidden_size),
+                make_tuple(kargs.x_stride, 1),
+                number<Vector_N>{},
+                number<1>{});
+
+            const auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {i_token, 0});
+        }();
+
+        // [experts, hidden_size],
+        const auto xscale_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<const XScaleDataType*>(kargs.p_xscale) + i_expert * kargs.hidden_size,
+                make_tuple(kargs.hidden_size),
+                make_tuple(1),
+                number<Vector_N>{},
+                number<1>{});
+
+            const auto tmp2_ =
+                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadN>{});
+
+            return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
+        }();
+
+        // [topk, tokens]
+        auto yscale_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<YScaleDataType*>(kargs.p_yscale) + i_topk * kargs.tokens,
+                make_tuple(kargs.tokens),
+                make_tuple(1),
+                number<1>{});
+
+            const auto tmp2_ =
+                pad_tensor_view(tmp_, make_tuple(number<Block_M>{}), sequence<kPadM>{});
+
+            return make_tile_window(tmp2_, make_tuple(number<Block_M>{}), {i_token});
+        }();
+
+        // [topk, tokens, hidden_size]
+        auto qy_window = [&]() {
+            auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<QYDataType*>(kargs.p_qy) + i_topk * kargs.tokens * kargs.y_stride,
+                make_tuple(kargs.tokens, kargs.hidden_size),
+                make_tuple(kargs.y_stride, 1),
+                number<Vector_N>{},
+                number<1>{});
+
+            auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {i_token, 0});
+        }();
+
+        __shared__ char smem[GetSmemSize()];
+
+        Pipeline{}(x_window, xscale_window, yscale_window, qy_window, kargs.hidden_size, smem);
+    }
+};
+
+} // namespace ck_tile
--- a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
+++ b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
@@ -19,7 +19,8 @@ struct SmoothquantHostArgs

    index_t m;
    index_t n;
-    index_t stride; // row_stride
+    index_t x_stride; // input row_stride
+    index_t y_stride; // output row_stride
 };

 // TODO: Extract some type to wrapper class
@@ -58,14 +59,21 @@ struct Smoothquant

        index_t m;
        index_t n;
-        index_t stride; // row_stride
+        index_t x_stride; // input row_stride
+        index_t y_stride; // out row_stride
    };
    using Hargs = SmoothquantHostArgs;

    CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
    {
-        return Kargs{
-            hargs.p_x, hargs.p_xscale, hargs.p_yscale, hargs.p_qy, hargs.m, hargs.n, hargs.stride};
+        return Kargs{hargs.p_x,
+                     hargs.p_xscale,
+                     hargs.p_yscale,
+                     hargs.p_qy,
+                     hargs.m,
+                     hargs.n,
+                     hargs.x_stride,
+                     hargs.y_stride};
    }

    CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
@@ -116,7 +124,7 @@ struct Smoothquant
            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                static_cast<const XDataType*>(kargs.p_x),
                make_tuple(kargs.m, kargs.n),
-                make_tuple(kargs.stride, 1),
+                make_tuple(kargs.x_stride, 1),
                number<Vector_N>{},
                number<1>{});

@@ -157,7 +165,7 @@ struct Smoothquant
            auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                static_cast<QYDataType*>(kargs.p_qy),
                make_tuple(kargs.m, kargs.n),
-                make_tuple(kargs.stride, 1),
+                make_tuple(kargs.y_stride, 1),
                number<Vector_N>{},
                number<1>{});


--- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp
@@ -25,6 +25,7 @@ struct SmoothquantPipelineOnePass
    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
    static constexpr bool kPadM              = false; // TODO - BlockSmoothquantProblem::kPadM
    static constexpr bool kPadN              = Problem::kPadN;
+    static constexpr bool UseMax3            = true; // TODO - Move to trait

    static constexpr const char* name = []() {
        if constexpr(kNeedCrossWarpSync)
@@ -52,7 +53,15 @@ struct SmoothquantPipelineOnePass
            xscale_window_, Policy::template MakeXScaleBlockTileDistribution<Problem>());

        auto reduce_absmax_func  = ReduceOp::AbsMax{};
-        auto reduce_max_func     = ReduceOp::Max{};
+        auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                         : "=v"(rtn)
+                         : "v"(acc_), "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
+        auto reduce_max_func = ReduceOp::Max{};
+
        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
        auto block_reduce2d_cross_warp_sync =
@@ -68,8 +77,23 @@ struct SmoothquantPipelineOnePass
            xscale);

        // compute absmax, cross-lane->cross-warp
-        auto absmax = block_reduce2d(
-            y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+        auto absmax = [&]() {
+            constexpr auto x_size_per_row =
+                x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
+            if constexpr(UseMax3 && std::is_same_v<ComputeDataType, float> &&
+                         x_size_per_row % 2 == 0)
+            {
+                return block_reduce2d(y,
+                                      reduce_absmax_func.GetIdentityValue<ComputeDataType>(),
+                                      reduce_absmax3_func,
+                                      sequence<1, 2>{});
+            }
+            else
+            {
+                return block_reduce2d(
+                    y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+            }
+        }();
        block_reduce2d_sync(absmax, reduce_max_func);
        block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);


--- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
@@ -25,6 +25,7 @@ struct SmoothquantPipelineTwoPass
    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
    static constexpr bool kPadM              = false; // TODO - BlockSmoothquantProblem::kPadM
    static constexpr bool kPadN              = Problem::kPadN;
+    static constexpr bool UseMax3            = true; // TODO - Move to trait

    static constexpr const char* name = []() {
        if constexpr(kNeedCrossWarpSync)
@@ -56,6 +57,13 @@ struct SmoothquantPipelineTwoPass
            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));

        auto reduce_absmax_func  = ReduceOp::AbsMax{};
+        auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                         : "=v"(rtn)
+                         : "v"(acc_), "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
        auto reduce_max_func     = ReduceOp::Max{};
        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
@@ -77,7 +85,13 @@ struct SmoothquantPipelineTwoPass
                x,
                xscale);

-            block_reduce2d(y, absmax, reduce_absmax_func);
+            constexpr auto x_size_per_row =
+                x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
+            if constexpr(UseMax3 && std::is_same_v<ComputeDataType, float> &&
+                         x_size_per_row % 2 == 0)
+                block_reduce2d(y, absmax, reduce_absmax3_func, sequence<1, 2>{});
+            else
+                block_reduce2d(y, absmax, reduce_absmax_func);

            move_tile_window(x_window, {0, Block_N});
            move_tile_window(xscale_window, {Block_N});

--- a/include/ck_tile/ref/README.md
+++ b/include/ck_tile/ref/README.md
+# reference
+
+this folder contains reference implementation of a specific op. Note by including a specific header, you are including the implementation(expecially the gpu implementation) into your source code, and compile that kernel into the fatbin, hence may increase your kernel obj code length. Usually the header starts with `reference_` is a cpu reference implementation. The header starts with `naive_` contains a gpu implementation with a small launcher.
+
+TODO: move `host/reference` under this folder
--- a/include/ck_tile/ref/naive_attention.hpp
+++ b/include/ck_tile/ref/naive_attention.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include <thread>
+#include <string>
+
+namespace ck_tile {
+
+enum class naive_attention_layout_enum
+{
+    BSHD,  // [batch, seqlen, nhead, hdim]
+    BHSD,  // [batch, nhead, seqlen, hdim]
+    BS3HD, // [batch, nhead, 3, seqlen, hdim], used when qkv are packed
+    PHSD,  // [pages, nhead, page_size, hdim]
+    // PHSDX, // [pages, nhead, page_size/x, hdim, x], where <# used pages>*page_size = seqlen
+    PHDSX, // [pages, nhead, hdim/x, page_size, x], where <# used pages>*page_size = seqlen
+    PHDS,  // [pages, nhead, hdim, page_size], where <# used pages>*page_size = seqlen
+};
+
+// will used to specialize kernel variation
+enum class naive_attention_variation_enum
+{
+    FLASH_BATCHED = 0, // standard flash attention, or xformer/sdpa, used for training
+    FLASH_GROUPED,
+    DECODE_PAGED, // decode attn, where kv token from another buffer called kvcache
+};
+
+// TODO: for simplicity, this will be used as host/device arg
+struct naive_attention_fwd_args
+{
+    void* q_ptr;
+    void* k_ptr;
+    void* v_ptr;
+    void* o_ptr;
+    void* context_len_ptr; // [batch] used when seqlen kv come from a pointer(each element is a
+                           // number, not cumsum)
+    void* page_table_ptr;  // [batch, max_pages_per_seq] seqlen_kv is in different block(paged attn)
+    void* kvscale_ptr;     // [nhead, 2(kv), hdim] used for kvcache dequant
+    float scale_s;
+    int hdim;
+    int hdim_v; // could be cross-attn, where V and Q/K hdim are different
+    int batch_q;
+    int batch_kv;
+    int batch_ratio_kv; // batch_q / batch_kv
+    int seqlen_q;       // in decode case, this should be 1
+    int seqlen_kv;      // if context_len_ptr is not nullptr, ignore this field
+    int nhead_q;
+    int nhead_kv;
+    int nhead_ratio_kv; // nhead_q / nhead_kv
+    int page_size;      // if paged, the seqlen-kv per each block
+    int max_pages_per_seq;
+};
+
+// this is trait for host API
+struct naive_attention_fwd_traits
+{
+    std::string q_type;
+    std::string k_type;
+    std::string v_type;
+    std::string o_type;
+    std::string q_layout;
+    std::string k_layout;
+    std::string v_layout;
+    std::string o_layout;
+    int variation; // sync with naive_attention_variation_enum
+};
+
+// this is trait for kernel template
+template <naive_attention_variation_enum variation_>
+struct naive_attention_fwd_kernel_traits
+{
+    static constexpr naive_attention_variation_enum variation = variation_;
+};
+
+// for simplicity, please do not use const-reference type for the template type
+template <typename QType,
+          typename KType,
+          typename VType,
+          typename OType,
+          typename AccType,
+          naive_attention_layout_enum QLayout,
+          naive_attention_layout_enum KLayout,
+          naive_attention_layout_enum VLayout,
+          naive_attention_layout_enum OLayout,
+          typename Traits>
+struct naive_attention_fwd_kernel
+{
+    static constexpr bool is_kvcache_i8 =
+        std::is_same_v<KType, int8_t> && std::is_same_v<VType, int8_t> && sizeof(QType) != 1;
+
+    // kvcache-i8 will have per head scale, we apply this scale to Q/P matrix instead of original
+    // K/V matrix. This can speed up conversion since Q/P usually is fp16/bf16/fp32
+    static constexpr bool is_kvcache_i8_forward_quant = is_kvcache_i8;
+
+    // TODO: hardcode
+    using KVScaleType = float;
+    using SoftmaxType = float;
+    using PType       = VType; // src A of gemm2, same type as V
+
+    using p_vec_type                = ext_vector_t<PType, 16 / sizeof(PType)>;
+    static constexpr int p_vec_elem = vector_traits<p_vec_type>::vector_size;
+
+    __host__ __device__ naive_attention_fwd_kernel() {}
+
+    template <typename T, naive_attention_layout_enum Layout>
+    struct addresser
+    {
+        int b, s, h, d; // batch, seqlen, nhead, hdim
+        T* base_ptr;
+        __device__ addresser(int b_, int s_, int h_, int d_, void* base_ptr_)
+            : b(b_), s(s_), h(h_), d(d_), base_ptr(reinterpret_cast<T*>(base_ptr_))
+        {
+        }
+
+        // TODO: all the batch/nhead offset will accumulate to the base pointer
+        __device__ T* get_base(int i_b, int i_h)
+        {
+            if constexpr(Layout == naive_attention_layout_enum::BSHD)
+                return base_ptr + i_b * s * h * d + i_h * d;
+            else if constexpr(Layout == naive_attention_layout_enum::BHSD)
+                return base_ptr + i_b * s * h * d + i_h * s * d;
+        }
+
+        __device__ int get_offset(int i_s, int i_d)
+        {
+            if constexpr(Layout == naive_attention_layout_enum::BSHD)
+                return i_s * h * d + i_d;
+            else if constexpr(Layout == naive_attention_layout_enum::BHSD)
+                return i_s * d + i_d;
+        }
+
+        // below set of API will directly use pointer inside this struct
+        __device__ void init(int i_b, int i_h) { base_ptr = get_base(i_b, i_h); }
+        __device__ T load(int i_s, int i_d) { return base_ptr[get_offset(i_s, i_d)]; }
+        __device__ void store(T value, int i_s, int i_d) { base_ptr[get_offset(i_s, i_d)] = value; }
+    };
+
+    template <typename T, naive_attention_layout_enum Layout>
+    struct page_addresser
+    {
+        int s, h, d;                             // page_size, nhead, hdim
+        static constexpr int x = 16 / sizeof(T); // pack 4 dword
+        T* base_ptr;
+        int* page_table_ptr; // TODO: page table always int
+        int i_h;             // store current head
+
+        __device__ page_addresser(int s_, int h_, int d_, void* base_ptr_, void* pptr_)
+            : s(s_),
+              h(h_),
+              d(d_),
+              base_ptr(reinterpret_cast<T*>(base_ptr_)),
+              page_table_ptr(reinterpret_cast<int*>(pptr_))
+        {
+        }
+
+        __device__ int64_t get_phy_page_idx(int i_s)
+        {
+            // dynamic compute page idx is simple but slow
+            int page_idx = i_s / s;
+            int phy      = page_table_ptr[page_idx];
+            return static_cast<int64_t>(phy);
+        }
+
+        __device__ int get_phy_page_offset(int i_s)
+        {
+            // dynamic compute page idx is simple but slow
+            return i_s % s;
+        }
+
+        __device__ int64_t get_offset(int i_s, int i_d)
+        {
+            int page_offset  = get_phy_page_offset(i_s);
+            int64_t page_idx = get_phy_page_idx(i_s);
+            int64_t base_    = page_idx * h * s * d;
+            if constexpr(Layout == naive_attention_layout_enum::PHSD)
+                return static_cast<int64_t>(i_h * s * d + page_offset * d + i_d) + base_;
+            else if constexpr(Layout == naive_attention_layout_enum::PHDSX)
+            {
+                int d_r = i_d / x;
+                int d_x = i_d % x;
+                return static_cast<int64_t>(i_h * d * s + d_r * s * x + page_offset * x + d_x) +
+                       base_;
+            }
+            else if constexpr(Layout == naive_attention_layout_enum::PHDS)
+            {
+                return static_cast<int64_t>(i_h * d * s + i_d * s + page_offset) + base_;
+            }
+        }
+
+        // below set of API will directly use pointer inside this struct
+        __device__ void init(int /*i_b*/, int i_h_) { i_h = i_h_; }
+        __device__ T load(int i_s, int i_d) { return base_ptr[get_offset(i_s, i_d)]; }
+        __device__ void store(T /*value*/, int /*i_s*/, int /*i_d*/) {}
+    };
+
+    template <typename T>
+    struct kvscale_addresser
+    {
+        int h, d; // nhead, hdim
+        T* base_ptr;
+        __device__ kvscale_addresser(int h_, int d_, void* p_)
+            : h(h_), d(d_), base_ptr(reinterpret_cast<T*>(p_))
+        {
+        }
+        __device__ int get_offset(int i_h, int i_d, int i_kv /*0 or 1*/)
+        {
+            // [h, 2, d]
+            return i_h * 2 * d + i_kv * d + i_d;
+        }
+        __device__ T load(int i_h, int i_d, int i_kv)
+        {
+            return base_ptr[get_offset(i_h, i_d, i_kv)];
+        }
+    };
+
+    __device__ __host__ static constexpr int get_block_size() { return 256; }
+
+    // for simpliciy, 1 WG always compute 1 token along q, compute all token along kv
+    // compute all hdim from q, compute WG_SIZE hdim from v
+    // 1) in prefill case, seqlen_q >= 1, seqlen_kv >= 1, batch_q=batch_kv
+    // 2) in decode case, seqlen_q = 1, batch_q is input num-tokens, batch_kv is 1
+    // 3) in paged-attn case, we still use 1 WG compute all the seqlen-kv for simplicity
+    // TODO: could support split-kv to validate intermediate logsum
+    __host__ static dim3 get_grid_size(naive_attention_fwd_args args)
+    {
+        constexpr int wg_size = get_block_size();
+        auto g =
+            dim3((args.hdim_v + wg_size - 1) / wg_size, args.seqlen_q, args.batch_q * args.nhead_q);
+        return g;
+    }
+
+    // reduce single pixel within a wave
+    template <typename T, typename F>
+    __device__ constexpr T wave_reduce(T local, F reduce_f)
+    {
+        // constexpr int wave_size = 64;
+        constexpr int reduce_stage = 6; // 1<<6=64
+        T v_local                  = local;
+#pragma unroll
+        for(int i_stage = 0; i_stage < reduce_stage; i_stage++)
+        {
+            int src_lane = __lane_id() ^ (1 << i_stage);
+            int32_t v_remote_tmp =
+                __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
+            T v_remote = bit_cast<T>(v_remote_tmp);
+            v_local    = reduce_f(v_local, v_remote);
+        }
+        return v_local;
+    }
+
+    // Note: this function must be called after wave_reduce
+    // Note: better not use this under if...else... with thread divergence (syncthreads)
+    template <typename T, typename F>
+    __device__ constexpr T cross_wave_reduce(T local, F reduce_f, T* smem)
+    {
+        constexpr int waves     = 4;
+        constexpr int wave_size = 64;
+        int lane_id             = threadIdx.x % wave_size;
+
+        __syncthreads();
+        smem[threadIdx.x] = local;
+        __syncthreads();
+
+        // the data within single wave is the same
+        // but for simplicity, we still use data from each lane.
+        T v_local = smem[lane_id];
+#pragma unroll
+        for(int i_stage = 1; i_stage < waves; i_stage++)
+        {
+            T v_remote = smem[i_stage * wave_size + lane_id];
+            v_local    = reduce_f(v_local, v_remote);
+        }
+        return v_local;
+    }
+
+    // kernel entry point
+    __device__ void operator()(naive_attention_fwd_args args)
+    {
+        constexpr int wg_size = get_block_size();
+        __shared__ char smem[wg_size * 4 * sizeof(float)]; //  should enough
+        int i_dv    = blockIdx.x * wg_size + threadIdx.x;  // index of hdim_v
+        int i_sq    = blockIdx.y;                          // index of seqlen_q
+        int i_batch = blockIdx.z;                          // index of batch_q * nhead_q
+        int i_bq    = i_batch / args.nhead_q;              // index of batch_q
+        int i_hq    = i_batch % args.nhead_q;              // index of nhead_q
+
+        int i_bk = i_bq / args.batch_ratio_kv;
+        int i_hk = i_hq / args.nhead_ratio_kv;
+
+        void* page_table_ptr = [&]() {
+            if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
+            {
+                return reinterpret_cast<int*>(args.page_table_ptr) + i_bq * args.max_pages_per_seq;
+            }
+            else
+            {
+                return nullptr;
+            }
+        }();
+
+        auto q_addr = [&]() {
+            if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
+            {
+                return addresser<QType, QLayout>{
+                    args.batch_q, args.seqlen_q, args.nhead_q, args.hdim, args.q_ptr};
+            }
+            else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
+            {
+                return addresser<QType, QLayout>{
+                    args.batch_q, args.seqlen_q, args.nhead_q, args.hdim, args.q_ptr};
+            }
+        }();
+        auto k_addr = [&]() {
+            if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
+            {
+                return addresser<KType, KLayout>{
+                    args.batch_kv, args.seqlen_kv, args.nhead_kv, args.hdim, args.k_ptr};
+            }
+            else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
+            {
+                return page_addresser<KType, KLayout>{
+                    args.page_size, args.nhead_kv, args.hdim, args.k_ptr, page_table_ptr};
+            }
+        }();
+        auto v_addr = [&]() {
+            if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
+            {
+                return addresser<VType, VLayout>{
+                    args.batch_kv, args.seqlen_kv, args.nhead_kv, args.hdim_v, args.v_ptr};
+            }
+            else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
+            {
+                return page_addresser<VType, VLayout>{
+                    args.page_size, args.nhead_kv, args.hdim_v, args.v_ptr, page_table_ptr};
+            }
+        }();
+        auto o_addr = [&]() {
+            if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
+            {
+                return addresser<OType, OLayout>{
+                    args.batch_q, args.seqlen_q, args.nhead_q, args.hdim_v, args.o_ptr};
+            }
+            else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
+            {
+                return addresser<OType, OLayout>{
+                    args.batch_q, args.seqlen_q, args.nhead_q, args.hdim_v, args.o_ptr};
+            }
+        }();
+
+        q_addr.init(i_bq, i_hq);
+        k_addr.init(i_bk, i_hk);
+        v_addr.init(i_bk, i_hk);
+        o_addr.init(i_bq, i_hq);
+
+        auto f_max        = [](auto x_, auto y_) { return max(x_, y_); };
+        auto f_sum        = [](auto x_, auto y_) { return x_ + y_; };
+        auto f_absmax_f32 = [](float v_0_, float v_1_) {
+            float rtn;
+            asm volatile("v_max_f32 %0, abs(%1), abs(%2)" : "=v"(rtn) : "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
+
+        int seqlen_kv = [&]() {
+            if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
+            {
+                return args.seqlen_kv;
+            }
+            else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
+            {
+                return reinterpret_cast<int*>(args.context_len_ptr)[i_bq];
+            }
+        }();
+
+        SoftmaxType row_max = -numeric<SoftmaxType>::infinity();
+        SoftmaxType l{0};
+        AccType o_acc = {0};
+
+        int sk_loops   = (seqlen_kv + wg_size - 1) / wg_size;
+        float qf_scale = .0f;
+        kvscale_addresser<KVScaleType> kvscale_addr{args.nhead_kv, args.hdim, args.kvscale_ptr};
+
+        if constexpr(is_kvcache_i8_forward_quant)
+        {
+            // AccType is i32 now, seqlen_q = 1, hdim up to 256
+            float q   = 0;
+            float k_s = 0;
+            if(static_cast<int>(threadIdx.x) < args.hdim)
+            {
+                q   = type_convert<float>(q_addr.load(0, threadIdx.x));
+                k_s = type_convert<float>(kvscale_addr.load(i_hk, threadIdx.x, 0));
+            }
+            // 1) we apply the k scale to q
+            float q_forwarded = q * k_s;
+
+            // 2) apply smooth-quant
+            // find absmax
+            float qf_max = wave_reduce(q_forwarded, f_absmax_f32);
+            qf_max       = cross_wave_reduce(qf_max, f_absmax_f32, reinterpret_cast<float*>(smem));
+
+            // per-token scale
+            qf_scale = qf_max / 127.0;
+
+            // devide by scale
+            q = q / qf_scale;
+
+            // fp32->i8
+            int8_t quantized_q = static_cast<int8_t>(q);
+            __syncthreads();
+            reinterpret_cast<int8_t*>(smem)[threadIdx.x] = quantized_q;
+            __syncthreads();
+
+            // after above process, we have 2 data
+            // 1) int8 q data stored in smem(no need to reload)
+            // 2) per-token scale qf_scale, to be mul after 1st gemm
+        }
+
+        for(int i_loop1 = 0; i_loop1 < sk_loops; i_loop1++)
+        {
+            int i_sk = i_loop1 * wg_size + threadIdx.x;
+            // gemm-1
+            SoftmaxType s_softmax = -numeric<SoftmaxType>::infinity();
+            if(i_sk < seqlen_kv)
+            {
+                AccType s_acc{0}; // clear for every loop
+                for(auto i_dq = 0; i_dq < args.hdim; i_dq++)
+                {
+                    if constexpr(is_kvcache_i8_forward_quant)
+                    {
+                        int8_t q = reinterpret_cast<int8_t*>(smem)[i_dq];
+                        auto k   = k_addr.load(i_sk, i_dq);
+
+                        s_acc += type_convert<AccType>(q) * type_convert<AccType>(k);
+                    }
+                    else
+                    {
+                        auto q = q_addr.load(i_sq, i_dq); // q will have duplicate load
+                        auto k = k_addr.load(i_sk, i_dq);
+
+                        s_acc += type_convert<AccType>(q) * type_convert<AccType>(k);
+                    }
+                }
+                // scale
+                s_softmax = type_convert<SoftmaxType>(s_acc);
+                s_softmax *=
+                    type_convert<SoftmaxType>(args.scale_s * ck_tile::log2e_v<SoftmaxType>);
+                if constexpr(is_kvcache_i8_forward_quant)
+                {
+                    s_softmax *= qf_scale; // post scale the per-token factor
+                }
+            }
+
+            // s->p
+            float pf_scale = 0.; // used for i8 quant
+            {
+                // softmax, find max
+                SoftmaxType old_max = row_max;
+                SoftmaxType cur_max = wave_reduce(s_softmax, f_max);
+
+                cur_max = cross_wave_reduce(cur_max, f_max, reinterpret_cast<SoftmaxType*>(smem));
+                row_max = max(old_max, cur_max); // update row_max
+                // softmax, exp(i_elem - max)
+                SoftmaxType p_compute = __builtin_amdgcn_exp2f(s_softmax - row_max);
+
+                // compute exp_sum
+                SoftmaxType row_sum = wave_reduce(p_compute, f_sum);
+                row_sum = cross_wave_reduce(row_sum, f_sum, reinterpret_cast<SoftmaxType*>(smem));
+
+                // l, pre-scall o_acc
+                SoftmaxType tmp = __builtin_amdgcn_exp2f(old_max - row_max);
+                l               = tmp * l + row_sum;
+                o_acc           = type_convert<AccType>(type_convert<SoftmaxType>(o_acc) * tmp);
+
+                // prepare the p_compute into smem, to let every thread read same p_compute and do
+                // 2nd gemm
+                if constexpr(is_kvcache_i8_forward_quant)
+                {
+                    float v_s = 0;
+                    if(static_cast<int>(threadIdx.x) < args.hdim_v)
+                    {
+                        v_s = type_convert<float>(kvscale_addr.load(i_hk, threadIdx.x, 1));
+                    }
+
+                    // 1) we apply the v scale to p
+                    float p_forwarded = p_compute * v_s;
+
+                    // 2) apply smooth-quant
+                    // find absmax
+                    float pf_max = wave_reduce(p_forwarded, f_absmax_f32);
+                    pf_max =
+                        cross_wave_reduce(pf_max, f_absmax_f32, reinterpret_cast<float*>(smem));
+
+                    // per-token scale
+                    pf_scale = pf_max / 127.0;
+
+                    // devide by scale
+                    p_compute = p_compute / pf_scale;
+
+                    // fp32->i8
+                    int8_t quantized_p = static_cast<int8_t>(p_compute);
+                    __syncthreads();
+                    reinterpret_cast<int8_t*>(smem)[threadIdx.x] = quantized_p;
+                    __syncthreads();
+                    // after above process, we have 2 data
+                    // 1) int8 p data stored in smem(no need to reload)
+                    // 2) per-token scale pf_scale, to be mul after 2nd gemm
+                }
+                else
+                {
+                    __syncthreads();
+                    reinterpret_cast<PType*>(smem)[threadIdx.x] = type_convert<PType>(p_compute);
+                    __syncthreads();
+                }
+            }
+
+            // gemm-2, simple loop over vector by vector
+            constexpr int gemm_2_loop = wg_size / p_vec_elem;
+            {
+                AccType o_acc_local = {0};
+                int sk_start = i_loop1 * wg_size; // we start from the first seqlen_kv element
+                for(int i_loop2 = 0; i_loop2 < gemm_2_loop; i_loop2++)
+                {
+                    p_vec_type p_vec = reinterpret_cast<p_vec_type*>(smem)[i_loop2];
+#pragma unroll
+                    for(int i_j = 0; i_j < p_vec_elem; i_j++)
+                    {
+                        int sv_offset = i_loop2 * p_vec_elem + i_j;
+                        int i_sv      = sk_start + sv_offset;
+
+                        VType v = 0.f;
+                        if(i_dv < args.hdim_v && i_sv < seqlen_kv)
+                        {
+                            v = v_addr.load(i_sv, i_dv);
+                        }
+
+                        o_acc_local += type_convert<AccType>(p_vec[i_j]) * type_convert<AccType>(v);
+                    }
+                }
+                if constexpr(is_kvcache_i8_forward_quant)
+                {
+                    // apply pr scale to local acc
+                    o_acc_local =
+                        type_convert<AccType>(type_convert<float>(o_acc_local) * pf_scale);
+                }
+                o_acc += o_acc_local;
+            }
+        }
+
+        // post scale o_acc
+        {
+            SoftmaxType tmp = l == 0.f ? 0.f : 1.f / l; // in case masking
+            o_acc           = type_convert<AccType>(type_convert<SoftmaxType>(o_acc) * tmp);
+        }
+
+        // store O
+        if(i_dv < args.hdim_v)
+            o_addr.store(type_convert<OType>(o_acc), i_sq, i_dv);
+    }
+};
+
+#define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_()                                                        \
+    {                                                                                                       \
+        using ktraits_ =                                                                                    \
+            naive_attention_fwd_kernel_traits<static_cast<naive_attention_variation_enum>(                  \
+                variation_)>;                                                                               \
+        using k_   = naive_attention_fwd_kernel<q_type_,                                                    \
+                                              k_type_,                                                    \
+                                              v_type_,                                                    \
+                                              o_type_,                                                    \
+                                              acc_type_,                                                  \
+                                              q_layout_,                                                  \
+                                              k_layout_,                                                  \
+                                              v_layout_,                                                  \
+                                              o_layout_,                                                  \
+                                              ktraits_>;                                                  \
+        dim3 grids = k_::get_grid_size(a);                                                                  \
+        r          = ck_tile::launch_kernel(s,                                                              \
+                                   ck_tile::make_kernel(k_{}, grids, k_::get_block_size(), 0, a)); \
+    }
+
+#define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_()                                                 \
+    if(t.variation == 0 && t.q_layout == "bshd" && t.k_layout == "bshd" && t.v_layout == "bshd" && \
+       t.o_layout == "bshd")                                                                       \
+    {                                                                                              \
+        constexpr auto q_layout_ = naive_attention_layout_enum::BSHD;                              \
+        constexpr auto k_layout_ = naive_attention_layout_enum::BSHD;                              \
+        constexpr auto v_layout_ = naive_attention_layout_enum::BSHD;                              \
+        constexpr auto o_layout_ = naive_attention_layout_enum::BSHD;                              \
+        constexpr int variation_ = 0;                                                              \
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_();                                              \
+    }                                                                                              \
+    else if(t.variation == 0 && t.q_layout == "bhsd" && t.k_layout == "bhsd" &&                    \
+            t.v_layout == "bhsd" && t.o_layout == "bhsd")                                          \
+    {                                                                                              \
+        constexpr auto q_layout_ = naive_attention_layout_enum::BHSD;                              \
+        constexpr auto k_layout_ = naive_attention_layout_enum::BHSD;                              \
+        constexpr auto v_layout_ = naive_attention_layout_enum::BHSD;                              \
+        constexpr auto o_layout_ = naive_attention_layout_enum::BHSD;                              \
+        constexpr int variation_ = 0;                                                              \
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_();                                              \
+    }                                                                                              \
+    else if(t.variation == 2 && t.q_layout == "bhsd" && t.k_layout == "phdsx" &&                   \
+            t.v_layout == "phds" && t.o_layout == "bhsd")                                          \
+    {                                                                                              \
+        constexpr auto q_layout_ = naive_attention_layout_enum::BHSD;                              \
+        constexpr auto k_layout_ = naive_attention_layout_enum::PHDSX;                             \
+        constexpr auto v_layout_ = naive_attention_layout_enum::PHDS;                              \
+        constexpr auto o_layout_ = naive_attention_layout_enum::BHSD;                              \
+        constexpr int variation_ = 2;                                                              \
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_();                                              \
+    }
+
+//
+CK_TILE_HOST float naive_attention_fwd(naive_attention_fwd_traits t,
+                                       naive_attention_fwd_args a,
+                                       ck_tile::stream_config s)
+{
+    float r = -1;
+    // TODO: do not explicitly create too much instance!
+    if(t.q_type == "fp16" && t.k_type == "fp16" && t.v_type == "fp16" && t.o_type == "fp16")
+    {
+        using q_type_   = fp16_t;
+        using k_type_   = fp16_t;
+        using v_type_   = fp16_t;
+        using o_type_   = fp16_t;
+        using acc_type_ = float;
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
+    }
+    else if(t.q_type == "bf16" && t.k_type == "bf16" && t.v_type == "bf16" && t.o_type == "bf16")
+    {
+        using q_type_   = bf16_t;
+        using k_type_   = bf16_t;
+        using v_type_   = bf16_t;
+        using o_type_   = bf16_t;
+        using acc_type_ = float;
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
+    }
+    else if(t.q_type == "bf16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "bf16")
+    {
+        using q_type_   = bf16_t;
+        using k_type_   = int8_t;
+        using v_type_   = int8_t;
+        using o_type_   = bf16_t;
+        using acc_type_ = int32_t; // NOTE!
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
+    }
+    else if(t.q_type == "fp16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "fp16")
+    {
+        using q_type_   = fp16_t;
+        using k_type_   = int8_t;
+        using v_type_   = int8_t;
+        using o_type_   = fp16_t;
+        using acc_type_ = int32_t; // NOTE!
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
+    }
+    return r;
+}
+
+#undef CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_
+#undef CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_
+
+} // namespace ck_tile
--- a/include/ck_tile/remod.py
+++ b/include/ck_tile/remod.py
@@ -7,6 +7,7 @@ import copy

 NS = 'ck_tile'
 OPS = 'ops'
+REF = 'ref'
 OPS_COMMON = 'common' # common header will be duplicated into ops/* other module

 HEADER_COMMON = f"""// SPDX-License-Identifier: MIT
@@ -29,6 +30,9 @@ class submodule_t:
    def push(self, f):
        if len(f.parents) != 1: # ignore ./xxx.hpp
            mod = get_module(f)
+            # ref is supposed to include one header on demand
+            if mod == REF:
+                return
            if mod == OPS:
                if mod not in self.m.keys():
                    self.m[mod] = dict()

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -62,9 +62,9 @@ struct ReferenceGemm : public device::BaseOperator
            auto f_mk_kn_mn = [&](auto m, auto n) {
                const int K = arg.a_m_k_.mDesc.GetLengths()[1];

-                AccDataType v_acc = 0;
-                ComputeTypeA v_a  = 0;
-                ComputeTypeB v_b  = 0;
+                AccDataType v_acc{0};
+                ComputeTypeA v_a{0};
+                ComputeTypeB v_b{0};

                for(int k = 0; k < K; ++k)
                {
@@ -93,7 +93,7 @@ struct ReferenceGemm : public device::BaseOperator
                        ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
                }

-                CDataType v_c = 0;
+                CDataType v_c{0};

                arg.c_element_op_(v_c, v_acc);


--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
@@ -237,6 +237,206 @@ void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpaddin
                                                      PassThrough,
                                                      PassThrough>>>& instances);
 #endif
+
+#if(defined(CK_ENABLE_FP8))
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+
 template <typename ADataType,
          typename BDataType,
          typename CDataType,
@@ -327,6 +527,121 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemm_S
        }
 #endif

+#if(defined(CK_ENABLE_FP8))
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+                    op_ptrs);
+            }
+        }
+        else if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, half_t> &&
+                          is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
@@ -95,6 +95,45 @@ void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(
                                                  PassThrough,
                                                  PassThrough>>>& instances);

+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
 void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
                                                  Col,
@@ -189,6 +228,124 @@ void add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_in
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
 #endif

 #if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8)
@@ -262,7 +419,11 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            {
                add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
-                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2_instances(
                    op_ptrs);
                add_device_grouped_gemm_multiple_d_xdl_two_stage_f16_f16_f16_mk_kn_mn_instances(
                    op_ptrs);
@@ -334,12 +495,34 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            {
                add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_kn_mn_instances(
                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2_instances(
+                    op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_instances(
                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2_instances(
+                    op_ptrs);
            }
        }
 #endif

--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once

 #include <cstdlib>

@@ -7,16 +9,17 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+#include "ck/utility/loop_scheduler.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-using F16 = ck::half_t;
-using F32 = float;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;

 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -25,97 +28,109 @@ template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

 using Empty_Tuple = ck::Tuple<>;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;

-using PassThrough                    = ck::tensor_operation::element_wise::PassThrough;
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+static constexpr auto PipelineV1         = ck::PipelineVersion::v1;
+static constexpr auto PipelineV2         = ck::PipelineVersion::v2;
+static constexpr auto DefaultScheduler   = ck::LoopScheduler::Default;
+static constexpr auto InterwaveScheduler = ck::LoopScheduler::Interwave;
+static constexpr auto GemmMNKPadding     = device::GemmSpecialization::MNKPadding;
+static constexpr auto GemmDefault        = device::GemmSpecialization::Default;

-using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instances = std::tuple<
+template <typename T,
+          device::GemmSpecialization GemmSpec = GemmMNKPadding,
+          PipelineVersion Pipeline            = PipelineV1,
+          LoopScheduler Scheduler             = DefaultScheduler,
+          enable_if_t<sizeof(T) == 2, bool>   = false>
+using device_grouped_gemm_xdl_splitk_2Bt_rrr_instances = std::tuple<
    // clang-format off
-        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v1>,
-
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
+        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| Pipeline | Loop      |
+        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector| Version  | Scheduler |
+        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|          |           |
+        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |          |           |
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,   Pipeline,  Scheduler>
+    // clang-format on
+    >;

-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v2>
+template <typename T,
+          device::GemmSpecialization GemmSpec = GemmMNKPadding,
+          PipelineVersion Pipeline            = PipelineV1,
+          LoopScheduler Scheduler             = DefaultScheduler,
+          enable_if_t<sizeof(T) == 2, bool>   = false>
+using device_grouped_gemm_xdl_splitk_2Bt_rcr_instances = std::tuple<
+    // clang-format off
+        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| Pipeline | Loop      |
+        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector| Version  | Scheduler |
+        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|          |           |
+        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |          |           |
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8, Pipeline,  Scheduler>
    // clang-format on
    >;

-void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Row,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  F16,
-                                                  F16,
-                                                  Empty_Tuple,
-                                                  F16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances, device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instances{});
-}
+template <typename T,
+          device::GemmSpecialization GemmSpec = GemmMNKPadding,
+          PipelineVersion Pipeline            = PipelineV1,
+          LoopScheduler Scheduler             = DefaultScheduler,
+          enable_if_t<sizeof(T) == 2, bool>   = false>
+using device_grouped_gemm_xdl_splitk_2Bt_crr_instances = std::tuple<
+    // clang-format off
+        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| Pipeline | Loop      |
+        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector| Version  | Scheduler |
+        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|          |           |
+        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |          |           |
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 2, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,   Pipeline,  Scheduler>
+    // clang-format on
+    >;

 } // namespace instance
 } // namespace device

--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -62,7 +62,7 @@ function(add_instance_library INSTANCE_NAME)
    endforeach()
    # Do not build mha instances if gfx94 or gfx90a targets are not on the target list
    foreach(source IN LISTS ARGN)
-    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx90a" AND source MATCHES "mha")
+	    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx90a" AND source MATCHES "mha")
         message("removing mha instance ${source} ")
         list(REMOVE_ITEM ARGN "${source}")
    endif()
@@ -87,6 +87,12 @@ function(add_instance_library INSTANCE_NAME)
         list(REMOVE_ITEM ARGN "${source}")
    endif()
    endforeach()
+    foreach(source IN LISTS ARGN)
+    if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "gemm_xdl_universal_streamk" AND source MATCHES "_f8_")
+         message("removing gemm_universal_streamk_f8 instance ${source} ")
+         list(REMOVE_ITEM ARGN "${source}")
+    endif()
+    endforeach()
    endif()
    #only continue if there are some source files left on the list
    if(ARGN)
@@ -340,7 +346,7 @@ if(CK_DEVICE_CONV_INSTANCES)
 endif()
 if(CK_DEVICE_MHA_INSTANCES)
        set(gpu_list ${INST_TARGETS})
-        if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a")
+	if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a")
            add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES})
            add_library(composablekernels::device_mha_operations ALIAS device_mha_operations)
            target_compile_features(device_mha_operations PUBLIC)