Commit 1857761a authored by ozturkosu's avatar ozturkosu

Merge branch 'muozturk_sk_padding' of https://github.com/ROCm/composable_kernel into muozturk_sk_padding
parents 4c64fa6d 715ffa67
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/flush_cache.hpp"
namespace ck {
// Currently we do not have an elegant way to put the single-LDS-buffer and double-LDS-buffer
// pipelines in the same kernel function. Blockers:
// 1. Two separate declarations of __shared__ pointers are the key to making sure data accesses
// operate on two distinct LDS chunks.
// 2. Occupied __shared__ memory is not released until the whole shader ends, i.e. A/B and C may
// not reuse the same LDS buffer when __shared__ is declared inside the block GEMM pipeline
template <typename GridwiseGemm,
typename BatchedGemmArg,
bool HasMainKBlockLoop,
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
index_t MinimumOccupancy = 1,
TailNumber TailNum = TailNumber::Full>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
#endif
kernel_batched_gemm_b_scale_xdl_cshuffle_v3(BatchedGemmArg karg)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
const index_t g_idx = blockIdx.z % karg.Batch;
const index_t k_idx = blockIdx.z / karg.Batch;
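// Illustration: the grid's z dimension is Batch * KBatch (see CalculateGridSize in the Invoker
// below), so with, e.g., Batch = 4 and KBatch = 2, blockIdx.z = 5 maps to g_idx = 5 % 4 = 1
// (which GEMM in the batch) and k_idx = 5 / 4 = 1 (which K split).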
const auto a_batch_offset = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
const auto b_batch_offset = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
const auto c_batch_offset = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
const auto b_scale_batch_offset = karg.compute_ptr_offset_of_batch.GetScaleBPtrOffset(g_idx);
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
karg.p_c_grid + c_batch_offset + splitk_batch_offset.c_reduce_offset,
karg.p_b_scale_grid + b_scale_batch_offset + splitk_batch_offset.scale_k_split_offset,
p_shared,
karg);
#else
ignore = karg;
#endif // end of if (defined(__gfx9__))
}
template <typename GridwiseGemm,
typename BatchedGemmArg,
bool HasMainKBlockLoop,
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
index_t MinimumOccupancy = 1,
TailNumber TailNum = TailNumber::Full>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
#endif
kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds(BatchedGemmArg karg)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
// Passing two LDS pointers is the key to telling the compiler that ds_read/ds_write
// can operate on different LDS chunks at the same time without an ordering dependency
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
const index_t g_idx = blockIdx.z % karg.Batch;
const index_t k_idx = blockIdx.z / karg.Batch;
const auto a_batch_offset = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
const auto b_batch_offset = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
const auto c_batch_offset = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
const auto b_scale_batch_offset = karg.compute_ptr_offset_of_batch.GetScaleBPtrOffset(g_idx);
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
karg.p_c_grid + c_batch_offset + splitk_batch_offset.c_reduce_offset,
karg.p_b_scale_grid + b_scale_batch_offset + splitk_batch_offset.scale_k_split_offset,
p_shared_0,
p_shared_1,
karg);
#else
ignore = karg;
#endif // end of if (defined(__gfx9__))
}
namespace tensor_operation {
namespace device {
template <typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType,
typename BScaleDataType,
typename CDataType,
typename GemmAccDataType,
typename CShuffleDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
GemmSpecialization GemmSpec,
index_t BlockSize,
index_t ScaleBlockN, // scale block for N
index_t ScaleBlockK, // scale block for K
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t AK1,
index_t BK1,
index_t MPerXDL,
index_t NPerXDL,
index_t MXdlPerWave,
index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsExtraN,
index_t CShuffleMXdlPerWavePerShuffle,
index_t CShuffleNXdlPerWavePerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
typename ComputeTypeA = CDataType,
typename ComputeTypeB = ComputeTypeA,
bool PermuteA = false,
bool PermuteB = false>
struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
: public DeviceBatchedGemmV2BScale<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
BScaleDataType,
CDataType,
ScaleBlockN,
ScaleBlockK,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation>
{
// GridwiseGemm
using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v3<
ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
GemmAccDataType,
CShuffleDataType,
CDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
GemmSpec,
BlockSize,
ScaleBlockN,
ScaleBlockK,
MPerBlock,
NPerBlock,
KPerBlock,
AK1,
BK1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_AK0_M_AK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_AK1,
false,
ABlockLdsExtraM,
BBlockTransferThreadClusterLengths_BK0_N_BK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_BK1,
false,
BBlockLdsExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CShuffleBlockTransferScalarPerVector_NPerBlock,
BlkGemmPipeSched,
BlkGemmPipelineVer,
ComputeTypeA,
ComputeTypeB,
PermuteA,
PermuteB>;
static constexpr index_t BPackedSize = []() {
if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
return 2;
else
return 1;
}();
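// Example: pk_i4_t packs two 4-bit B values into one byte, so BPackedSize is 2 and
// GetBPtrOffset() below divides the batch stride by 2 to convert from (unpacked) B elements
// to packed storage units; for all other B types the stride is used as-is.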
struct ComputePtrOffsetOfStridedBatch
{
ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideC,
index_t BatchStrideScaleB)
: BatchStrideA_(BatchStrideA),
BatchStrideB_(BatchStrideB),
BatchStrideC_(BatchStrideC),
BatchStrideScaleB_(BatchStrideScaleB)
{
}
__host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_) / BPackedSize;
}
__host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideC_);
}
__host__ __device__ constexpr long_index_t GetScaleBPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideScaleB_);
}
private:
index_t BatchStrideA_;
index_t BatchStrideB_;
index_t BatchStrideC_;
index_t BatchStrideScaleB_;
};
struct Argument : public GridwiseGemm::Argument
{
index_t Batch;
ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch;
Argument(const ADataType* p_a_grid_,
const BDataType* p_b_grid_,
CDataType* p_c_grid_,
index_t M_,
index_t N_,
index_t K_,
index_t StrideA_,
index_t StrideB_,
index_t StrideC_,
index_t StrideScaleB_,
index_t BatchStrideA_,
index_t BatchStrideB_,
index_t BatchStrideC_,
index_t BatchStrideScaleB_,
const BScaleDataType* p_b_scale_grid_,
index_t Batch_,
index_t KBatch_,
AElementwiseOperation a_element_op_,
BElementwiseOperation b_element_op_,
CElementwiseOperation c_element_op_)
: GridwiseGemm::Argument(p_a_grid_,
p_b_grid_,
p_c_grid_,
M_,
N_,
K_,
StrideA_,
StrideB_,
StrideC_,
StrideScaleB_,
p_b_scale_grid_,
KBatch_, // KBatch
a_element_op_,
b_element_op_,
c_element_op_),
Batch(Batch_),
compute_ptr_offset_of_batch(
BatchStrideA_, BatchStrideB_, BatchStrideC_, BatchStrideScaleB_)
{
}
};
// Invoker
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
if(stream_config.log_level_ > 0)
{
arg.Print();
}
if(!GridwiseGemm::CheckValidity(arg))
{
throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
}
index_t gdx, gdy, gdz;
std::tie(gdx, gdy, gdz) =
GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.Batch * arg.KBatch);
float ave_time = 0;
index_t k_grain = arg.KBatch * KPerBlock;
index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
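// Worked example (values assumed): K = 1000, KBatch = 4, KPerBlock = 64 gives
// k_grain = 256 and K_split = ((1000 + 255) / 256) * 64 = 4 * 64 = 256, i.e. the
// per-split K length rounded up to a whole number of KPerBlock tiles.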
const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
const auto Run = [&](const auto& kernel) {
if(stream_config.flush_cache)
{
Argument arg_ = arg;
const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
auto size_a_buffer =
a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
auto size_b_buffer =
b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
ck::utility::RotatingMemWrapper<Argument> rotating_mem(
arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
rotating_mem.Print();
auto run_flush_cache = [&]() {
// flush icache
ck::utility::flush_icache();
// rotating mem
rotating_mem.Next();
// clear c mem
if(arg_.KBatch > 1)
hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
0,
arg_.M * arg_.N * sizeof(CDataType),
stream_config.stream_id_));
};
ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
stream_config,
run_flush_cache,
kernel,
dim3(gdx, gdy, gdz),
dim3(BlockSize),
0,
arg_);
}
else
{
if(arg.KBatch > 1)
hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
0,
arg.M * arg.N * sizeof(CDataType),
stream_config.stream_id_));
ave_time = launch_and_time_kernel(
stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
}
};
constexpr index_t minimum_occupancy =
BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave
? (BlkGemmPipelineVer == BlockGemmPipelineVersion::v3 &&
MPerBlock * NPerBlock * KPerBlock * sizeof(ADataType) <= 128 * 128 * 64 * 2)
? 2
: 1
: 2;
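// Illustration (assumed tile config): with the Intrawave scheduler and pipeline v3, a
// 128x128x64 fp16 block tile gives 128 * 128 * 64 * 2 bytes, which meets the bound, so the
// occupancy hint is 2; a larger tile falls back to 1, while the Interwave scheduler always
// requests 2.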
if(has_main_k_block_loop)
{
// Tail number always full
if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
{
if(arg.KBatch > 1)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy>;
Run(kernel);
}
else
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy>;
Run(kernel);
}
}
// Tail number could be One to Seven
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
{
if(arg.KBatch > 1)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy,
TailNumber::One>;
Run(kernel);
}
else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
TailNumber::Full)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy,
TailNumber::Full>;
Run(kernel);
}
if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy,
TailNumber::Two>;
Run(kernel);
}
}
if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
TailNumber::Three)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy,
TailNumber::Three>;
Run(kernel);
}
}
if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
TailNumber::Four)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy,
TailNumber::Four>;
Run(kernel);
}
}
if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
TailNumber::Five)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy,
TailNumber::Five>;
Run(kernel);
}
}
if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy,
TailNumber::Six>;
Run(kernel);
}
}
if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
TailNumber::Seven)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy,
TailNumber::Seven>;
Run(kernel);
}
}
}
else
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
TailNumber::One>;
Run(kernel);
}
else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
TailNumber::Full)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
TailNumber::Full>;
Run(kernel);
}
if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
TailNumber::Two>;
Run(kernel);
}
}
if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
TailNumber::Three)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
TailNumber::Three>;
Run(kernel);
}
}
if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
TailNumber::Four)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
TailNumber::Four>;
Run(kernel);
}
}
if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
TailNumber::Five)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
TailNumber::Five>;
Run(kernel);
}
}
if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
TailNumber::Six>;
Run(kernel);
}
}
if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
TailNumber::Seven)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
TailNumber::Seven>;
Run(kernel);
}
}
}
}
// Tail number could be Odd or Even
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
{
if(arg.KBatch > 1)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy,
TailNumber::Odd>;
Run(kernel);
}
else
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy,
TailNumber::Even>;
Run(kernel);
}
}
else
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
TailNumber::Odd>;
Run(kernel);
}
else
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
TailNumber::Even>;
Run(kernel);
}
}
}
else
{
if(arg.KBatch > 1)
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy,
TailNumber::Odd>;
Run(kernel);
}
else
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy,
TailNumber::Even>;
Run(kernel);
}
}
else
{
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
TailNumber::Odd>;
Run(kernel);
}
else
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
TailNumber::Even>;
Run(kernel);
}
}
}
}
else
{
// Tail number always 1
if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
{
if(arg.KBatch > 1)
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
false,
InMemoryDataOperationEnum::AtomicAdd,
minimum_occupancy>;
Run(kernel);
}
else
{
const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
GridwiseGemm,
Argument,
false,
InMemoryDataOperationEnum::Set,
minimum_occupancy>;
Run(kernel);
}
}
}
return ave_time;
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
if(!ck::is_xdl_supported())
{
return false;
}
if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
{
return false;
}
if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
GemmSpec == GemmSpecialization::NKPadding ||
GemmSpec == GemmSpecialization::MNKPadding ||
GemmSpec == GemmSpecialization::KPadding))
{
return false;
}
return GridwiseGemm::CheckValidity(arg);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
index_t GetKPerBlock() override { return KPerBlock; }
bool GetPermuteB() override { return PermuteB; }
static auto MakeArgument(const ADataType* p_a,
const BDataType* p_b,
CDataType* p_c,
index_t M,
index_t N,
index_t K,
index_t StrideA,
index_t StrideB,
index_t StrideC,
index_t StrideScaleB,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideC,
index_t BatchStrideScaleB,
const BScaleDataType* p_b_scale,
index_t Batch,
index_t KBatch,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
return Argument{p_a,
p_b,
p_c,
M,
N,
K,
StrideA,
StrideB,
StrideC,
StrideScaleB,
BatchStrideA,
BatchStrideB,
BatchStrideC,
BatchStrideScaleB,
p_b_scale,
Batch,
KBatch,
a_element_op,
b_element_op,
c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
index_t M,
index_t N,
index_t K,
index_t StrideA,
index_t StrideB,
index_t StrideC,
index_t StrideScaleB,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideC,
index_t BatchStrideScaleB,
const void* p_b_scale,
index_t Batch,
index_t KBatch,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op) override
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<CDataType*>(p_c),
M,
N,
K,
StrideA,
StrideB,
StrideC,
StrideScaleB,
BatchStrideA,
BatchStrideB,
BatchStrideC,
BatchStrideScaleB,
static_cast<const BScaleDataType*>(p_b_scale),
Batch,
KBatch,
a_element_op,
b_element_op,
c_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
{BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
{BlockGemmPipelineScheduler::Interwave, "Interwave"}};
std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
{BlockGemmPipelineVersion::v1, "v1"},
{BlockGemmPipelineVersion::v2, "v2"},
{BlockGemmPipelineVersion::v3, "v3"},
{BlockGemmPipelineVersion::v4, "v4"},
{BlockGemmPipelineVersion::v5, "v5"}};
// clang-format off
str << "DeviceGemmXdlUniversal"
<< "<"
<< getGemmSpecializationString(GemmSpec) << ", "
<< std::string(ALayout::name)[0]
<< std::string(BLayout::name)[0]
<< std::string(CLayout::name)[0]
<< ">"
<< " BlkSize: "
<< BlockSize << ", "
<< "BlkTile: "
<< MPerBlock<<"x"<<NPerBlock<<"x"<<KPerBlock << ", "
<< "WaveTile: "
<< MPerXDL<<"x"<<NPerXDL << ", "
<< "WaveMap: "
<< MXdlPerWave<<"x" << NXdlPerWave<<", "
<< "VmemReadVec: "
<< ABlockTransferSrcScalarPerVector<<"x"<<BBlockTransferSrcScalarPerVector<<", "
<< "BlkGemmPipelineScheduler: "
<< BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
<< "BlkGemmPipelineVersion: "
<< BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
<< "BlkGemmPipelinePrefetchStages: "
<< GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
......@@ -138,6 +138,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
if(stream_config.log_level_ > 0)
{
arg.Print();
GridwiseGemm::BlockwiseGemmPipe::HotLoopInstList::Print();
}
if(!GridwiseGemm::CheckValidity(arg))
......@@ -733,7 +734,9 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
<< "BlkGemmPipelineVersion: "
<< BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
<< "BlkGemmPipelinePrefetchStages: "
<< GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
<< GridwiseGemm::BlockwiseGemmPipe::PrefetchStages << ", "
<< "Kpack: "
<< GridwiseGemm::BlockwiseGemmPipe::AMmaKStride;
// clang-format on
return str.str();
......
......@@ -224,12 +224,12 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
}();
// Pad both M and K to be multiples of the block sizes
const auto a_grid_desc_m_k = transform_tensor_descriptor(
a_grid_desc_mraw_kraw,
make_tuple(make_right_pad_transform(M, MPad - M),
make_right_pad_transform(K, KPad - K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto a_grid_desc_m_k =
transform_tensor_descriptor(a_grid_desc_mraw_kraw,
make_tuple(make_right_pad_transform(M, MPad - M),
make_right_pad_transform(K, KPad - K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
a_grid_desc_m_k,
......@@ -322,14 +322,14 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(StrideB, I1));
}
}();
// Pad both N and K to be multiples of the block sizes
const auto b_grid_desc_n_k = transform_tensor_descriptor(
b_grid_desc_nraw_kraw,
make_tuple(make_right_pad_transform(N, NPad - N),
make_right_pad_transform(K, KPad - K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto b_grid_desc_n_k =
transform_tensor_descriptor(b_grid_desc_nraw_kraw,
make_tuple(make_right_pad_transform(N, NPad - N),
make_right_pad_transform(K, KPad - K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
b_grid_desc_n_k,
......@@ -990,7 +990,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
!(is_same<tensor_layout::gemm::RowMajor, ALayout>::value))
{
if(!(karg.M % MPerBlock == 0))
......@@ -1008,7 +1008,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
(is_same<tensor_layout::gemm::RowMajor, BLayout>::value))
{
if(!(karg.N % NPerBlock == 0))
......@@ -1075,7 +1075,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
<< ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
......@@ -1093,9 +1092,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
std::cout << "Arg N (" << karg.N
<< ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
<< BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
<< ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
<< BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
return false;
}
}
......@@ -1110,7 +1109,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
<< BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
}
......@@ -1128,7 +1127,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
<< __FILE__ << ":" << __LINE__ << ", in function: " << __func__
<< std::endl;
}
return false;
}
}
......@@ -1145,7 +1144,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
<< __FILE__ << ":" << __LINE__ << ", in function: " << __func__
<< std::endl;
}
return false;
}
}
......
......@@ -37,7 +37,7 @@ __global__ void
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
......@@ -70,7 +70,7 @@ __global__ void
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
......@@ -638,45 +638,45 @@ struct GridwiseGemm_xdl_cshuffle_v3
struct SplitKBatchOffset
{
__device__ SplitKBatchOffset(Argument& karg)
__device__ SplitKBatchOffset(Argument& karg, index_t k_id)
{
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
{
a_k_split_offset = blockIdx.z * karg.KRead / APackedSize;
a_k_split_offset = k_id * karg.KRead / APackedSize;
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
a_k_split_offset = blockIdx.z * karg.KRead * karg.StrideA;
a_k_split_offset = k_id * karg.KRead * karg.StrideA;
}
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
{
b_k_split_offset = blockIdx.z * karg.KRead * karg.StrideB;
b_k_split_offset = k_id * karg.KRead * karg.StrideB;
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
{
if constexpr(!PermuteB)
{
b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize;
b_k_split_offset = k_id * karg.KRead / BPackedSize;
}
else
{
const int k0_offset = karg.KRead * karg.N;
b_k_split_offset = blockIdx.z * k0_offset / BPackedSize;
b_k_split_offset = k_id * k0_offset / BPackedSize;
}
}
// Calculate B scale offset
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
{
scale_k_split_offset = blockIdx.z * (karg.KRead / ScaleBlockK) * karg.StrideB;
scale_k_split_offset = k_id * (karg.KRead / ScaleBlockK) * karg.StrideB;
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
{
scale_k_split_offset = blockIdx.z * (karg.KRead / ScaleBlockK);
scale_k_split_offset = k_id * (karg.KRead / ScaleBlockK);
}
if(blockIdx.z < static_cast<uint32_t>(karg.KBatch - 1))
if(k_id < (karg.KBatch - 1))
{
karg.K = karg.KRead;
}
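// For every split except the last one, the effective K seen by this workgroup is clamped to
// KRead, the per-split K length; e.g., assuming K = 1000, KBatch = 4 and KRead = 256,
// splits 0..2 each compute over 256 of the K dimension.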
......@@ -687,7 +687,7 @@ struct GridwiseGemm_xdl_cshuffle_v3
if(karg.IsReduceAdd())
{
c_reduce_offset = blockIdx.z * karg.M * karg.N;
c_reduce_offset = k_id * karg.M * karg.N;
}
else
{
......
......@@ -90,14 +90,22 @@ struct BlockwiseGemmXdlops_pipeline_hotloop_inst
KPerXDL);
printf(" A/B buffer load inst: %d, %d\n A/B LDS write inst: %d, %d\n A/B LDS read inst: "
"%d, %d\n C MFMA inst: %d\n",
"%d, %d\n C MFMA inst: %d\n"
"A/B LDS read width: %d, %d, A/B LDS write width: %d, %d, A/B buffer load width: "
"%d/ %d\n",
A_Buffer_Load_Inst_Num,
B_Buffer_Load_Inst_Num,
A_LDS_Write_Inst_Num,
B_LDS_Write_Inst_Num,
A_LDS_Read_Inst_Num,
B_LDS_Read_Inst_Num,
C_MFMA_Inst_Num);
C_MFMA_Inst_Num,
A_LDS_Read_Width,
B_LDS_Read_Width,
ALDSWriteWidth,
BLDSWriteWidth,
ABufferLoadWidth,
BBufferLoadWidth);
}
};
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/numeric/vector_type.hpp"
......@@ -8,16 +8,75 @@
namespace ck_tile {
CK_TILE_HOST_DEVICE bf16_t add_bf16_t(const bf16_t& a, const bf16_t& b)
template <typename T, typename ComputeType>
CK_TILE_HOST_DEVICE T add(const T& a, const T& b)
{
return type_convert<bf16_t>(type_convert<float>(a) + type_convert<float>(b));
return type_convert<T>(type_convert<ComputeType>(a) + type_convert<ComputeType>(b));
}
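// Usage example: add<bf16_t, float>(x, y) widens both operands to float, adds them, and
// converts the sum back to bf16_t, so the narrow type is never used as the accumulator;
// the vector helpers below simply apply this per lane.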
CK_TILE_HOST_DEVICE bf16x2_t add_bf16x2_t(const bf16x2_t& a, const bf16x2_t& b)
{
bf16x2_t rtn;
rtn[0] = add_bf16_t(a[0], b[0]);
rtn[1] = add_bf16_t(a[1], b[1]);
rtn[0] = add<bf16_t, float>(a[0], b[0]);
rtn[1] = add<bf16_t, float>(a[1], b[1]);
return rtn;
}
CK_TILE_HOST_DEVICE bf16x4_t add_bf16x4_t(const bf16x4_t& a, const bf16x4_t& b)
{
bf16x4_t rtn;
rtn[0] = add<bf16_t, float>(a[0], b[0]);
rtn[1] = add<bf16_t, float>(a[1], b[1]);
rtn[2] = add<bf16_t, float>(a[2], b[2]);
rtn[3] = add<bf16_t, float>(a[3], b[3]);
return rtn;
}
CK_TILE_HOST_DEVICE fp8x4_t add_fp8x4_t(const fp8x4_t& a, const fp8x4_t& b)
{
fp8x4_t rtn;
rtn[0] = add<fp8_t, float>(a[0], b[0]);
rtn[1] = add<fp8_t, float>(a[1], b[1]);
rtn[2] = add<fp8_t, float>(a[2], b[2]);
rtn[3] = add<fp8_t, float>(a[3], b[3]);
return rtn;
}
CK_TILE_HOST_DEVICE fp8x8_t add_fp8x8_t(const fp8x8_t& a, const fp8x8_t& b)
{
fp8x8_t rtn;
rtn[0] = add<fp8_t, float>(a[0], b[0]);
rtn[1] = add<fp8_t, float>(a[1], b[1]);
rtn[2] = add<fp8_t, float>(a[2], b[2]);
rtn[3] = add<fp8_t, float>(a[3], b[3]);
rtn[4] = add<fp8_t, float>(a[4], b[4]);
rtn[5] = add<fp8_t, float>(a[5], b[5]);
rtn[6] = add<fp8_t, float>(a[6], b[6]);
rtn[7] = add<fp8_t, float>(a[7], b[7]);
return rtn;
}
CK_TILE_HOST_DEVICE bf8x4_t add_bf8x4_t(const bf8x4_t& a, const bf8x4_t& b)
{
bf8x4_t rtn;
rtn[0] = add<bf8_t, float>(a[0], b[0]);
rtn[1] = add<bf8_t, float>(a[1], b[1]);
rtn[2] = add<bf8_t, float>(a[2], b[2]);
rtn[3] = add<bf8_t, float>(a[3], b[3]);
return rtn;
}
CK_TILE_HOST_DEVICE bf8x8_t add_bf8x8_t(const bf8x8_t& a, const bf8x8_t& b)
{
bf8x8_t rtn;
rtn[0] = add<bf8_t, float>(a[0], b[0]);
rtn[1] = add<bf8_t, float>(a[1], b[1]);
rtn[2] = add<bf8_t, float>(a[2], b[2]);
rtn[3] = add<bf8_t, float>(a[3], b[3]);
rtn[4] = add<bf8_t, float>(a[4], b[4]);
rtn[5] = add<bf8_t, float>(a[5], b[5]);
rtn[6] = add<bf8_t, float>(a[6], b[6]);
rtn[7] = add<bf8_t, float>(a[7], b[7]);
return rtn;
}
......@@ -59,6 +118,192 @@ CK_TILE_DEVICE void atomic_add<bf16x2_t>(bf16x2_t* p_dst, const bf16x2_t& x)
} while(cur_v.u32 != old_v);
}
template <>
CK_TILE_DEVICE void atomic_add<bf16x4_t>(bf16x4_t* p_dst, bf16x4_t const& x)
{
// Union to treat the pointer as either bf16x4_t* or uint64_t*:
union U64BF164_ADDR
{
uint64_t* u64_a;
bf16x4_t* bf164_a;
};
// Union to treat the data as either bf16x4_t or 64-bit integer
union U64BF164
{
uint64_t u64;
bf16x4_t bf164;
};
U64BF164_ADDR addr;
addr.bf164_a = p_dst; // interpret p_dst as a 64-bit location
// First read (non-atomic) of the old value
U64BF164 cur_v;
cur_v.u64 = *addr.u64_a;
U64BF164 new_v_union;
uint64_t old_v, new_v;
do
{
// old 64 bits
old_v = cur_v.u64;
// Add elementwise in bf16
new_v_union.bf164 = add_bf16x4_t(cur_v.bf164, x);
new_v = new_v_union.u64;
// Attempt the 64-bit CAS
cur_v.u64 = atomicCAS(addr.u64_a, old_v, new_v);
} while(cur_v.u64 != old_v);
}
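// Note: the compare-and-swap loop retries only when another thread updated the same 64-bit
// word between the read and the atomicCAS, so the add is applied exactly once per caller.
// It assumes p_dst is 8-byte aligned, which holds when bf16x4_t keeps its natural alignment.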
template <>
CK_TILE_DEVICE void atomic_add<fp8x4_t>(fp8x4_t* p_dst, const fp8x4_t& x)
{
union U32FP84_ADDR
{
uint32_t* u32_a;
fp8x4_t* fp84_a;
};
union U32FP84
{
uint32_t u32;
fp8x4_t fp84;
};
U32FP84_ADDR dword_addr;
U32FP84 cur_v;
U32FP84 new_;
uint32_t old_v, new_v;
dword_addr.fp84_a = p_dst;
cur_v.u32 = *dword_addr.u32_a;
do
{
old_v = cur_v.u32;
new_.fp84 = add_fp8x4_t(cur_v.fp84, x);
new_v = new_.u32;
cur_v.u32 = atomicCAS(dword_addr.u32_a, old_v, new_v);
} while(cur_v.u32 != old_v);
}
template <>
CK_TILE_DEVICE void atomic_add<bf8x4_t>(bf8x4_t* p_dst, const bf8x4_t& x)
{
union U32BF84_ADDR
{
uint32_t* u32_a;
bf8x4_t* bf84_a;
};
union U32BF84
{
uint32_t u32;
bf8x4_t bf84;
};
U32BF84_ADDR dword_addr;
U32BF84 cur_v;
U32BF84 new_;
uint32_t old_v, new_v;
dword_addr.bf84_a = p_dst;
cur_v.u32 = *dword_addr.u32_a;
do
{
old_v = cur_v.u32;
new_.bf84 = add_bf8x4_t(cur_v.bf84, x);
new_v = new_.u32;
cur_v.u32 = atomicCAS(dword_addr.u32_a, old_v, new_v);
} while(cur_v.u32 != old_v);
}
//
// Atomic add for fp8x8_t
//
template <>
CK_TILE_DEVICE void atomic_add<fp8x8_t>(fp8x8_t* p_dst, fp8x8_t const& x)
{
// Union for addressing 64 bits as either "fp8x8_t" or a 64-bit integer.
union U64FP88_ADDR
{
uint64_t* u64_a; // pointer to 64-bit integer
fp8x8_t* fp88_a; // pointer to fp8x8_t
};
union U64FP88
{
uint64_t u64;
fp8x8_t fp88;
};
U64FP88_ADDR dword_addr;
U64FP88 cur_v;
U64FP88 new_v_union;
uint64_t old_v, new_v;
// Point to the destination as both fp8x8_t* and uint64_t*.
dword_addr.fp88_a = p_dst;
// Initial read of 64 bits from memory
cur_v.u64 = *dword_addr.u64_a;
do
{
old_v = cur_v.u64;
// Add each fp8 element using the add_fp8x8_t(...) routine defined above
new_v_union.fp88 = add_fp8x8_t(cur_v.fp88, x);
new_v = new_v_union.u64;
// Attempt 64-bit CAS
cur_v.u64 = atomicCAS(dword_addr.u64_a, old_v, new_v);
} while(cur_v.u64 != old_v);
}
//
// Atomic add for bf8x8_t
//
template <>
CK_TILE_DEVICE void atomic_add<bf8x8_t>(bf8x8_t* p_dst, bf8x8_t const& x)
{
union U64BF88_ADDR
{
uint64_t* u64_a;
bf8x8_t* bf88_a;
};
union U64BF88
{
uint64_t u64;
bf8x8_t bf88;
};
U64BF88_ADDR dword_addr;
U64BF88 cur_v;
U64BF88 new_v_union;
uint64_t old_v, new_v;
dword_addr.bf88_a = p_dst;
// Read the original 64 bits
cur_v.u64 = *dword_addr.u64_a;
do
{
old_v = cur_v.u64;
// Add each bf8 element using the add_bf8x8_t(...) routine defined above
new_v_union.bf88 = add_bf8x8_t(cur_v.bf88, x);
new_v = new_v_union.u64;
// 64-bit CAS loop
cur_v.u64 = atomicCAS(dword_addr.u64_a, old_v, new_v);
} while(cur_v.u64 != old_v);
}
template <typename T, index_t N>
CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer<T, N>& x)
{
......@@ -66,8 +311,10 @@ CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer<T, N>& x)
(std::is_same<T, uint32_t>::value && (N == 1)) ||
(std::is_same<T, float>::value && (N == 1 || N == 2)) ||
(std::is_same<T, double>::value && (N == 1 || N == 2)) ||
(std::is_same<T, bf16_t>::value && (N == 2 || N == 4)),
"wrong! not implemented");
(std::is_same<T, bf16_t>::value && (N == 2 || N == 4 || N == 8)) ||
(std::is_same<T, fp8_t>::value && (N == 4 || N == 8 || N == 16)) ||
(std::is_same<T, bf8_t>::value && (N == 4 || N == 8 || N == 16)),
"The granularity of the thread buffer is unsupported on the hardware!");
constexpr auto I0 = number<0>{};
constexpr auto I1 = number<1>{};
......@@ -118,9 +365,45 @@ CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer<T, N>& x)
}
else if constexpr(N == 4)
{
atomic_add(c_style_pointer_cast<bf16x2_t*>(p_dst), x.template get_as<bf16x2_t>()[I0]);
atomic_add(c_style_pointer_cast<bf16x2_t*>(p_dst) + 1,
x.template get_as<bf16x2_t>()[I1]);
atomic_add(c_style_pointer_cast<bf16x4_t*>(p_dst), x.template get_as<bf16x4_t>()[I0]);
}
else if constexpr(N == 8)
{
atomic_add(c_style_pointer_cast<bf16x4_t*>(p_dst), x.template get_as<bf16x4_t>()[I0]);
atomic_add(c_style_pointer_cast<bf16x4_t*>(p_dst) + 1,
x.template get_as<bf16x4_t>()[I1]);
}
}
else if constexpr(std::is_same<T, fp8_t>::value)
{
if constexpr(N == 4)
{
atomic_add(c_style_pointer_cast<fp8x4_t*>(p_dst), x.template get_as<fp8x4_t>()[I0]);
}
if constexpr(N == 8)
{
atomic_add(c_style_pointer_cast<fp8x8_t*>(p_dst), x.template get_as<fp8x8_t>()[I0]);
}
if constexpr(N == 16)
{
atomic_add(c_style_pointer_cast<fp8x8_t*>(p_dst), x.template get_as<fp8x8_t>()[I0]);
atomic_add(c_style_pointer_cast<fp8x8_t*>(p_dst) + 1, x.template get_as<fp8x8_t>()[I1]);
}
}
else if constexpr(std::is_same<T, bf8_t>::value)
{
if constexpr(N == 4)
{
atomic_add(c_style_pointer_cast<bf8x4_t*>(p_dst), x.template get_as<bf8x4_t>()[I0]);
}
if constexpr(N == 8)
{
atomic_add(c_style_pointer_cast<bf8x8_t*>(p_dst), x.template get_as<bf8x8_t>()[I0]);
}
if constexpr(N == 16)
{
atomic_add(c_style_pointer_cast<bf8x8_t*>(p_dst), x.template get_as<bf8x8_t>()[I0]);
atomic_add(c_style_pointer_cast<bf8x8_t*>(p_dst) + 1, x.template get_as<bf8x8_t>()[I1]);
}
}
}
......
......@@ -20,6 +20,7 @@
#include "ck_tile/host/reference/reference_batched_masking.hpp"
#include "ck_tile/host/reference/reference_batched_rotary_position_embedding.hpp"
#include "ck_tile/host/reference/reference_batched_softmax.hpp"
#include "ck_tile/host/reference/reference_batched_transpose.hpp"
#include "ck_tile/host/reference/reference_elementwise.hpp"
#include "ck_tile/host/reference/reference_fused_moe.hpp"
#include "ck_tile/host/reference/reference_gemm.hpp"
......@@ -34,4 +35,3 @@
#include "ck_tile/host/reference/reference_topk.hpp"
#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/timer.hpp"
#include "ck_tile/host/reference/reference_batched_transpose.hpp"
......@@ -22,13 +22,14 @@ template <typename ComputeDataType, typename OutDataType, typename AccDataType =
double get_relative_threshold(const int number_of_accumulations = 1)
{
using F8 = ck_tile::fp8_t;
using BF8 = ck_tile::bf8_t;
using F16 = ck_tile::half_t;
using BF16 = ck_tile::bf16_t;
using F32 = float;
using I8 = int8_t;
using I32 = int32_t;
static_assert(is_any_of<ComputeDataType, F8, F16, BF16, F32, I8, I32, int>::value,
static_assert(is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled ComputeDataType for setting up the relative threshold!");
double compute_error = 0;
......@@ -41,7 +42,7 @@ double get_relative_threshold(const int number_of_accumulations = 1)
compute_error = std::pow(2, -numeric_traits<ComputeDataType>::mant) * 0.5;
}
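// For example, half_t carries 10 mantissa bits, so its relative step here is
// 2^-10 * 0.5 (about 4.9e-4), while bf8_t (2 mantissa bits) yields 0.125 and fp8_t
// (3 mantissa bits) yields 0.0625; hence the BF8 entry in the type list above.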
static_assert(is_any_of<OutDataType, F8, F16, BF16, F32, I8, I32, int>::value,
static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled OutDataType for setting up the relative threshold!");
double output_error = 0;
......@@ -55,7 +56,7 @@ double get_relative_threshold(const int number_of_accumulations = 1)
}
double midway_error = std::max(compute_error, output_error);
static_assert(is_any_of<AccDataType, F8, F16, BF16, F32, I8, I32, int>::value,
static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled AccDataType for setting up the relative threshold!");
double acc_error = 0;
......@@ -74,13 +75,14 @@ template <typename ComputeDataType, typename OutDataType, typename AccDataType =
double get_absolute_threshold(const double max_possible_num, const int number_of_accumulations = 1)
{
using F8 = ck_tile::fp8_t;
using BF8 = ck_tile::bf8_t;
using F16 = ck_tile::half_t;
using BF16 = ck_tile::bf16_t;
using F32 = float;
using I8 = int8_t;
using I32 = int32_t;
static_assert(is_any_of<ComputeDataType, F8, F16, BF16, F32, I8, I32, int>::value,
static_assert(is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
auto expo = std::log2(std::abs(max_possible_num));
......@@ -94,7 +96,7 @@ double get_absolute_threshold(const double max_possible_num, const int number_of
compute_error = std::pow(2, expo - numeric_traits<ComputeDataType>::mant) * 0.5;
}
static_assert(is_any_of<OutDataType, F8, F16, BF16, F32, I8, I32, int>::value,
static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled OutDataType for setting up the absolute threshold!");
double output_error = 0;
......@@ -108,7 +110,7 @@ double get_absolute_threshold(const double max_possible_num, const int number_of
}
double midway_error = std::max(compute_error, output_error);
static_assert(is_any_of<AccDataType, F8, F16, BF16, F32, I8, I32, int>::value,
static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled AccDataType for setting up the absolute threshold!");
double acc_error = 0;
......@@ -501,7 +503,11 @@ std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_val
}
if(!res)
{
std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
const float error_percent =
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
std::cerr << "max err: " << max_err;
std::cerr << ", number of errors: " << err_count;
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
}
return res;
}
......
......@@ -80,13 +80,14 @@ __global__ void naive_gemm_kernel(ADataType* A,
int b_index = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
? col * strideB + k
: k * strideB + col;
acc += static_cast<AccDataType>(A[a_index]) * static_cast<AccDataType>(B[b_index]);
acc += ck_tile::type_convert<AccDataType>(A[a_index]) *
ck_tile::type_convert<AccDataType>(B[b_index]);
}
int c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
? row * strideC + col
: col * strideC + row;
C[c_index] = acc;
C[c_index] = ck_tile::type_convert<CDataType>(acc);
}
}
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
......@@ -77,6 +77,7 @@ struct CShuffleEpilogue
*
* @return The vector store size for C tensor.
*/
template <typename ODataType>
CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeC()
{
constexpr index_t MaxVectorStoreSize = 16;
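// Making this a template on ODataType presumably lets the 16-byte store limit translate into
// an element count per output type, e.g. 8 for fp16/bf16 outputs versus 4 for fp32 (the exact
// formula is outside this hunk).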
......@@ -142,7 +143,7 @@ struct CShuffleEpilogue
TileDistributionEncodingPattern2D<kBlockSize,
kMPerIteration,
kNPerIteration,
GetVectorSizeC(),
GetVectorSizeC<ODataType>(),
tile_distribution_pattern::thread_raked>;
constexpr auto dram_tile_distribution = TileEncodingPattern::Make2DStaticTileDistribution();
......
......@@ -79,7 +79,10 @@ struct BlockUniversalGemmAsBsCr
// TODO: Should we have two policies? Interwave & Intrawave ??
static constexpr index_t InterWaveSchedulingMacClusters = 1;
static constexpr index_t KPack = WarpGemm::kKPerThread;
// This should be at least WarpGemm::Impl::kABKPerLane; the open question is how to
// determine the upper limit (or the exact value).
// TODO: Should we introduce AK1/BK1 parameters?
static constexpr index_t KPack = 8;
static constexpr index_t KPerThread = KIterPerWarp * KPack;
static constexpr index_t KRepeat = KPerThread / KPack;
};
......
......@@ -159,7 +159,7 @@ struct GemmKernel
CK_TILE_HOST static bool IsSupportedArgument(const GemmKernelArgs& kargs)
{
if constexpr(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
if constexpr(EpiloguePipeline::template GetVectorSizeC<CDataType>() % 2 != 0 &&
is_any_of<CDataType, fp16_t, bf16_t>::value)
{
if(kargs.k_batch != 1)
......@@ -240,7 +240,7 @@ struct GemmKernel
<< std::endl;
return false;
}
if(kargs.N % EpiloguePipeline::GetVectorSizeC() != 0)
if(kargs.N % EpiloguePipeline::template GetVectorSizeC<CDataType>() != 0)
{
std::cerr << "N is not a multiple of vector load size for C tensor!" << std::endl;
return false;
......@@ -255,7 +255,7 @@ struct GemmKernel
<< std::endl;
return false;
}
if(kargs.M % EpiloguePipeline::GetVectorSizeC() != 0)
if(kargs.M % EpiloguePipeline::template GetVectorSizeC<CDataType>() != 0)
{
std::cerr << "M is not a multiple of vector load size for C tensor!" << std::endl;
return false;
......@@ -321,7 +321,7 @@ struct GemmKernel
c_ptr,
make_tuple(kargs.M, kargs.N),
make_tuple(kargs.stride_C, 1),
number<EpiloguePipeline::GetVectorSizeC()>{},
number<EpiloguePipeline::template GetVectorSizeC<CDataType>()>{},
number<1>{});
}
else
......@@ -519,7 +519,7 @@ struct GemmKernel
{
// Do not compile in case where we have unsupported
// VectorSizeC & data type configuration.
if constexpr(!(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
if constexpr(!(EpiloguePipeline::template GetVectorSizeC<CDataType>() % 2 != 0 &&
is_any_of<CDataType, fp16_t, bf16_t>::value))
{
RunGemm<memory_operation_enum::atomic_add>(
......
......@@ -3,6 +3,9 @@
#pragma once
#include <string>
#include <sstream>
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
......@@ -83,6 +86,56 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
return Policy::template GetSmemSize<Problem>();
}
CK_TILE_HOST static std::string Print()
{
constexpr index_t MPerXDL = BlockGemm::WarpGemm::kM;
constexpr index_t NPerXDL = BlockGemm::WarpGemm::kN;
constexpr index_t KPerXDL = BlockGemm::WarpGemm::WarpGemmAttribute::Impl::kK;
constexpr index_t WaveSize = 64;
constexpr index_t WaveNumM = BlockGemmShape::BlockWarps::at(I0{});
constexpr index_t WaveNumN = BlockGemmShape::BlockWarps::at(I1{});
// The widths below should be equal to AK1 and BK1, respectively
constexpr index_t A_LDS_Read_Width = Policy::template GetSmemPackA<Problem>();
constexpr index_t B_LDS_Read_Width = Policy::template GetSmemPackB<Problem>();
constexpr index_t A_LDS_Write_Width = Policy::template GetSmemPackA<Problem>();
constexpr index_t B_LDS_Write_Width = Policy::template GetSmemPackB<Problem>();
constexpr index_t A_Buffer_Load_Inst_Num =
MPerBlock * KPerBlock / (BlockSize * GetVectorSizeA());
constexpr index_t B_Buffer_Load_Inst_Num =
NPerBlock * KPerBlock / (BlockSize * GetVectorSizeB());
constexpr index_t A_LDS_Write_Inst_Num =
MPerBlock * KPerBlock / (BlockSize * A_LDS_Write_Width);
constexpr index_t B_LDS_Write_Inst_Num =
NPerBlock * KPerBlock / (BlockSize * B_LDS_Write_Width);
constexpr index_t A_LDS_Read_Inst_Num =
WaveNumN * MPerBlock * KPerBlock / (BlockSize * A_LDS_Read_Width);
constexpr index_t B_LDS_Read_Inst_Num =
WaveNumM * MPerBlock * KPerBlock / (BlockSize * B_LDS_Read_Width);
constexpr index_t C_MFMA_Inst_Num = MPerBlock * NPerBlock * KPerBlock /
(BlockSize / WaveSize) / (MPerXDL * NPerXDL * KPerXDL);
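// Worked example (assumed config, not from this file): BlockSize = 256, WaveSize = 64,
// a 128x128x64 block tile, a 32x32x16 warp tile and A/B vector size 8 give
// A/B buffer loads = 128 * 64 / (256 * 8) = 4 each and
// C MFMA inst = 128 * 128 * 64 / 4 / (32 * 32 * 16) = 16 MFMAs per wave per main-loop iteration.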
auto str = std::stringstream{};
str << "A/B vector size: " << GetVectorSizeA() << ", " << GetVectorSizeB() << "\n"
<< "A/B LDS read/write width: " << A_LDS_Read_Width << ", " << B_LDS_Read_Width << "\n"
<< "A/B buffer load inst: " << A_Buffer_Load_Inst_Num << ", " << B_Buffer_Load_Inst_Num
<< "\n"
<< "A/B LDS write inst: " << A_LDS_Write_Inst_Num << ", " << B_LDS_Write_Inst_Num
<< "\n"
<< "A/B LDS read inst: " << A_LDS_Read_Inst_Num << ", " << B_LDS_Read_Inst_Num << "\n"
<< "C MFMA inst: " << C_MFMA_Inst_Num << "\n"
<< "KPack: " << BlockGemm::Traits::KPack << "\n"
<< "PrefetchStages: " << PrefetchStages << "\n";
return str.str();
}
template <GemmPipelineScheduler Scheduler>
struct PipelineImpl : public PipelineImplBase
{
......@@ -95,29 +148,35 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
CK_TILE_DEVICE static constexpr auto HotLoopScheduler()
{
constexpr index_t MPerXDL = BlockGemmShape::WarpTile::at(I0{});
constexpr index_t NPerXDL = BlockGemmShape::WarpTile::at(I1{});
constexpr index_t KPerXDL = BlockGemmShape::WarpTile::at(I2{});
constexpr index_t MPerXDL = BlockGemm::WarpGemm::kM;
constexpr index_t NPerXDL = BlockGemm::WarpGemm::kN;
constexpr index_t KPerXDL = BlockGemm::WarpGemm::WarpGemmAttribute::Impl::kK;
constexpr index_t WaveSize = 64;
constexpr index_t WaveNumM = BlockGemmShape::BlockWarps::at(I0{});
constexpr index_t WaveNumN = BlockGemmShape::BlockWarps::at(I1{});
constexpr index_t A_LDS_Read_Width = KPerXDL;
constexpr index_t B_LDS_Read_Width = KPerXDL;
// The widths below should be equal to AK1 and BK1, respectively
constexpr index_t A_LDS_Read_Width = Policy::template GetSmemPackA<Problem>();
constexpr index_t B_LDS_Read_Width = Policy::template GetSmemPackB<Problem>();
constexpr index_t A_LDS_Write_Width = Policy::template GetSmemPackA<Problem>();
constexpr index_t B_LDS_Write_Width = Policy::template GetSmemPackB<Problem>();
constexpr index_t A_Buffer_Load_Inst_Num =
MPerBlock * KPerBlock / (BlockSize * GetVectorSizeA());
constexpr index_t B_Buffer_Load_Inst_Num =
NPerBlock * KPerBlock / (BlockSize * GetVectorSizeB());
constexpr index_t A_LDS_Write_Inst_Num = MPerBlock * KPerBlock / (BlockSize * KPerXDL);
constexpr index_t B_LDS_Write_Inst_Num = NPerBlock * KPerBlock / (BlockSize * KPerXDL);
constexpr index_t A_LDS_Write_Inst_Num =
MPerBlock * KPerBlock / (BlockSize * A_LDS_Write_Width);
constexpr index_t B_LDS_Write_Inst_Num =
NPerBlock * KPerBlock / (BlockSize * B_LDS_Write_Width);
constexpr index_t A_LDS_Read_Inst_Num =
WaveNumN * MPerBlock * KPerBlock / (BlockSize * KPerXDL);
WaveNumN * MPerBlock * KPerBlock / (BlockSize * A_LDS_Read_Width);
constexpr index_t B_LDS_Read_Inst_Num =
WaveNumM * MPerBlock * KPerBlock / (BlockSize * KPerXDL);
WaveNumM * MPerBlock * KPerBlock / (BlockSize * B_LDS_Read_Width);
constexpr index_t C_MFMA_Inst_Num = MPerBlock * NPerBlock * KPerBlock /
(BlockSize / WaveSize) /
......
......@@ -185,7 +185,6 @@ struct UniversalGemmPipelineAgBgCrPolicy
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
{
using ADataType = remove_cvref_t<typename Problem::ADataType>;
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
template <typename ADataType,
typename BDataType,
typename CDataType,
typename AccDataType,
typename ScaleDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename ComputeTypeA = CDataType,
typename ComputeTypeB = ComputeTypeA>
struct ReferenceMXGemm : public device::BaseOperator
{
// Argument
struct Argument : public device::BaseArgument
{
Argument(const Tensor<ADataType>& a_m_k,
const Tensor<ScaleDataType>& a_m_kblock_scales,
const Tensor<BDataType>& b_k_n,
const Tensor<ScaleDataType>& b_kblock_n_scales,
Tensor<CDataType>& c_m_n,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
: a_m_k_{a_m_k},
a_m_kblock_scales_{a_m_kblock_scales},
b_k_n_{b_k_n},
b_kblock_n_scales_{b_kblock_n_scales},
c_m_n_{c_m_n},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op}
{
}
const Tensor<ADataType>& a_m_k_;
const Tensor<ScaleDataType>& a_m_kblock_scales_;
const Tensor<BDataType>& b_k_n_;
const Tensor<ScaleDataType>& b_kblock_n_scales_;
Tensor<CDataType>& c_m_n_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
};
// Invoker
struct Invoker : public device::BaseInvoker
{
using Argument = ReferenceMXGemm::Argument;
float Run(const Argument& arg)
{
using GemmInstance = ck::tensor_operation::host::ReferenceGemm<ComputeTypeA,
ComputeTypeB,
CDataType,
AccDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
ComputeTypeA,
ComputeTypeB>;
Tensor<ComputeTypeA> a_m_k_scaled(arg.a_m_k_.mDesc);
Tensor<ComputeTypeB> b_k_n_scaled(arg.b_k_n_.mDesc);
const auto M = arg.a_m_k_.mDesc.GetLengths()[0];
const auto N = arg.b_k_n_.mDesc.GetLengths()[1];
const auto K = arg.a_m_k_.mDesc.GetLengths()[1];
const auto SCALE_BLOCK = K / arg.a_m_kblock_scales_.mDesc.GetLengths()[1];
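// Example (assumed shapes): K = 256 with 8 scale columns gives SCALE_BLOCK = 32, so
// element a_m_k_(m, 70) is multiplied by scale a_m_kblock_scales_(m, 70 / 32) = (m, 2)
// before the plain reference GEMM below is invoked.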
for(size_t m = 0; m < M; m++)
{
for(size_t k = 0; k < K; k++)
{
a_m_k_scaled(m, k) =
type_convert<ComputeTypeA>(arg.a_m_k_(m, k)) *
type_convert<ComputeTypeA>(arg.a_m_kblock_scales_(m, k / SCALE_BLOCK));
}
}
for(size_t n = 0; n < N; n++)
{
for(size_t k = 0; k < K; k++)
{
b_k_n_scaled(k, n) =
type_convert<ComputeTypeB>(arg.b_k_n_(k, n)) *
type_convert<ComputeTypeB>(arg.b_kblock_n_scales_(k / SCALE_BLOCK, n));
}
}
auto ref_gemm = GemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(a_m_k_scaled,
b_k_n_scaled,
arg.c_m_n_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_);
ref_invoker.Run(ref_argument);
return 0;
}
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
static auto MakeArgument(const Tensor<ADataType>& a_m_k,
const Tensor<ScaleDataType>& a_m_kblock_scales,
const Tensor<BDataType>& b_k_n,
const Tensor<ScaleDataType>& b_kblock_n_scales,
Tensor<CDataType>& c_m_n,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
return Argument{a_m_k,
a_m_kblock_scales,
b_k_n,
b_kblock_n_scales,
c_m_n,
a_element_op,
b_element_op,
c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceMXGemm"
<< std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
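// A minimal host-side usage sketch (illustrative only: the data types, tensor sizes and
// ScaleBlockK value below are assumptions, not taken from this header). Build the scaled
// reference GEMM, feed it A/B plus their per-K-block scale tensors, and let it fill the
// baseline C used for verification.
//
//   using ck::half_t;
//   using ck::tensor_operation::element_wise::PassThrough;
//   using RefGemm = ck::tensor_operation::host::ReferenceMXGemm<
//       half_t, half_t, half_t, float, half_t,
//       PassThrough, PassThrough, PassThrough, float, float>;
//
//   constexpr ck::index_t M = 128, N = 128, K = 256, ScaleBlockK = 128;
//   Tensor<half_t> a_m_k({M, K}), b_k_n({K, N}), c_m_n({M, N});
//   Tensor<half_t> a_scales({M, K / ScaleBlockK}), b_scales({K / ScaleBlockK, N});
//
//   auto argument = RefGemm::MakeArgument(a_m_k, a_scales, b_k_n, b_scales, c_m_n,
//                                         PassThrough{}, PassThrough{}, PassThrough{});
//   RefGemm::MakeInvoker().Run(argument);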
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include <memory>
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
#if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_FP8))
void add_device_batched_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(
std::vector<std::unique_ptr<DeviceBatchedGemmV2BScale<Row,
Col,
Row,
F16,
I4,
F16,
F16,
1,
128,
PassThrough,
PassThrough,
PassThrough>>>& instances);
#endif
template <typename ADataType,
typename BDataType,
typename BScaleDataType,
typename CDataType,
typename ALayout,
typename BLayout,
typename CLayout,
index_t ScaleBlockK>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatchedGemmV2BScale<
ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
BScaleDataType,
CDataType,
1,
ScaleBlockK,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>>
{
using DeviceOp = DeviceBatchedGemmV2BScale<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
BScaleDataType,
CDataType,
1,
ScaleBlockK,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
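// Instances are currently registered only for fp16 A, packed-int4 B and fp16 C with
// row-major A, column-major B and row-major C; any other combination returns an empty list.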
if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, pk_i4_t> &&
is_same_v<CDataType, half_t>)
{
if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
is_same_v<CLayout, Row>)
{
add_device_batched_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(
op_ptrs);
}
}
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
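// A minimal client-side sketch (illustrative only: it reuses the alias names from the
// declaration above, and the loop body is an assumption). Enumerate every registered
// b-scale batched GEMM instance for the fp16 / packed-int4 row-col-row case and print
// its type string.
//
//   using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemmV2BScale<
//       Row, Col, Row, F16, I4, F16, F16, 1, 128,
//       PassThrough, PassThrough, PassThrough>;
//   using Factory  = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<DeviceOp>;
//
//   for(const auto& op_ptr : Factory::GetInstances())
//       std::cout << op_ptr->GetTypeString() << std::endl;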
# ONLY XDL_KERNELS
set(BATCHED_GEMM_B_SCALE_INSTANCES)
list(APPEND BATCHED_GEMM_B_SCALE_INSTANCES
device_batched_gemm_b_scale_xdl_f16_i4_f16/device_batched_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp
)
set_source_files_properties(device_batched_gemm_b_scale_xdl_f16_i4_f16/device_batched_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
add_instance_library(device_batched_gemm_b_scale_instance ${BATCHED_GEMM_B_SCALE_INSTANCES})
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using I4 = pk_i4_t;
using F16 = half_t;
using F32 = float;
using Row = tensor_layout::gemm::RowMajor;
using Col = tensor_layout::gemm::ColumnMajor;
template <index_t... Is>
using S = Sequence<Is...>;
using PassThrough = element_wise::PassThrough;
static constexpr auto GemmDefault = GemmSpecialization::Default;
static constexpr auto GemmKPadding = GemmSpecialization::KPadding;
static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding;
static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
using device_batched_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances = std::tuple<
// clang-format off
//#########################| ALayout| BLayout| CLayout|AData| BData| BScale| CData| AccData| Cshuffle| A| B| C| GEMM| Block| Scale| Scale| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm|
//#########################| | | | Type| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline|
//#########################| | | | | | Type| | | | Operation| Operation| Operation| | | N| K| | | | | |Wave| Wave| | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version|
//#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
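//
// Each row below is one DeviceBatchedGemm_Xdl_CShuffleV3_BScale configuration; the trailing
// //N comments index individual instances (the numbering is not contiguous).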
// Compute friendly
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 128, 8, 32, 32, 32, 2, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //0
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 64, 8, 32, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //1
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 64, 8, 32, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //3
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 64, 8, 32, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //4
// Latency friendly
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //5
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 1, 128, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //6
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 1, 128, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //7
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 32, 128, 8, 32, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //8
// Memory friendly v3
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 128, 32, 128, 8, 32, 32, 32, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //9
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 128, 16, 128, 8, 16, 16, 16, 4, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //10
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 64, 32, 128, 8, 32, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //11
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 64, 16, 128, 8, 16, 16, 16, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //12
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //13
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 32, 128, 8, 32, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //16
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 64, 128, 8, 32, 16, 16, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //17
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 64, 128, 8, 32, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //18
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 128, 128, 8, 32, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //19
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 128, 128, 8, 32, 32, 32, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //20
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 16, 256, 128, 8, 32, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //21
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 32, 256, 128, 8, 32, 32, 32, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //22
// Memory friendly v4
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 64, 32, 128, 8, 32, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //23
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 64, 16, 128, 8, 16, 16, 16, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //24
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //25
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 1, 128, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //26
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 32, 128, 8, 32, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //28
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 64, 128, 8, 32, 16, 16, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //29
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 64, 128, 8, 32, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //30
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 128, 128, 8, 32, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //31
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 128, 128, 8, 32, 32, 32, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //32
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 16, 256, 128, 8, 32, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //33
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 32, 256, 128, 8, 32, 32, 32, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, //34
// New compute friendly kernel
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 64, 8, 32, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //35
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 64, 8, 32, 32, 32, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, //36
// New memory friendly kernel
DeviceBatchedGemm_Xdl_CShuffleV3_BScale< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 16, 64, 256, 8, 32, 16, 16, 1, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false> //37
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "device_batched_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_batched_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(
std::vector<std::unique_ptr<DeviceBatchedGemmV2BScale<Row,
Col,
Row,
F16,
I4,
F16,
F16,
1,
128,
PassThrough,
PassThrough,
PassThrough>>>& instances)
{
add_device_operation_instances(
instances,
device_batched_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances<Intrawave,
GemmDefault>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck