Unverified Commit f1e53807 authored by Illia Silin, committed by GitHub

Merge branch 'develop' into ck_host_lib

parents 7450417d d9f1ead3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -44,10 +44,19 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
else
os << delim;
if constexpr(std::is_same_v<T, ck::f8_t> || std::is_same_v<T, ck::bf8_t>)
using RangeType = ck::remove_cvref_t<decltype(v)>;
if constexpr(std::is_same_v<RangeType, ck::f8_t> || std::is_same_v<RangeType, ck::bf8_t> ||
std::is_same_v<RangeType, ck::bhalf_t>)
{
os << ck::type_convert<float>(v);
}
else if constexpr(std::is_same_v<RangeType, ck::pk_i4_t>)
{
const auto packed_floats = ck::type_convert<ck::float2_t>(v);
const ck::vector_type<float, 2> vector_of_floats{packed_floats};
os << vector_of_floats.template AsType<float>()[ck::Number<0>{}] << delim
<< vector_of_floats.template AsType<float>()[ck::Number<1>{}];
}
else
{
os << static_cast<T>(v);
......@@ -266,18 +275,18 @@ struct Tensor
using Data = std::vector<T>;
template <typename X>
Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize())
Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(GetElementSpaceSize())
{
}
template <typename X, typename Y>
Tensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
: mDesc(lens, strides), mData(mDesc.GetElementSpaceSize())
: mDesc(lens, strides), mData(GetElementSpaceSize())
{
}
template <typename Lengths>
Tensor(const Lengths& lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize())
Tensor(const Lengths& lens) : mDesc(lens), mData(GetElementSpaceSize())
{
}
......@@ -287,7 +296,7 @@ struct Tensor
{
}
Tensor(const Descriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpaceSize()) {}
Tensor(const Descriptor& desc) : mDesc(desc), mData(GetElementSpaceSize()) {}
template <typename OutT>
Tensor<OutT> CopyAsType() const
......@@ -322,11 +331,21 @@ struct Tensor
std::size_t GetElementSize() const { return mDesc.GetElementSize(); }
std::size_t GetElementSpaceSize() const { return mDesc.GetElementSpaceSize(); }
std::size_t GetElementSpaceSize() const
{
if constexpr(ck::is_same_v<ck::remove_cvref_t<T>, ck::pk_i4_t>)
{
return (mDesc.GetElementSpaceSize() + 1) / 2;
}
else
{
return mDesc.GetElementSpaceSize();
}
}
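// Example (illustrative): with T == ck::pk_i4_t and mDesc.GetElementSpaceSize() == 7,
// GetElementSpaceSize() returns (7 + 1) / 2 == 4, since two packed 4-bit values share
// one byte of storage; for every other element type the descriptor's count is used as is.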
std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); }
void SetZero() { ck::ranges::fill<T>(mData, 0); }
void SetZero() { ck::ranges::fill<T>(mData, T{0}); }
template <typename F>
void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
......@@ -469,29 +488,64 @@ struct Tensor
template <typename... Is>
std::size_t GetOffsetFromMultiIndex(Is... is) const
{
return mDesc.GetOffsetFromMultiIndex(is...);
if constexpr(ck::is_same_v<ck::remove_cvref_t<T>, ck::pk_i4_t>)
{
return mDesc.GetOffsetFromMultiIndex(is...) / 2;
}
else
{
return mDesc.GetOffsetFromMultiIndex(is...);
}
}
template <typename... Is>
T& operator()(Is... is)
{
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
if constexpr(ck::is_same_v<ck::remove_cvref_t<T>, ck::pk_i4_t>)
{
return mData[mDesc.GetOffsetFromMultiIndex(is...) / 2];
}
else
{
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
}
}
template <typename... Is>
const T& operator()(Is... is) const
{
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
if constexpr(ck::is_same_v<ck::remove_cvref_t<T>, ck::pk_i4_t>)
{
return mData[mDesc.GetOffsetFromMultiIndex(is...) / 2];
}
else
{
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
}
}
T& operator()(std::vector<std::size_t> idx)
{
return mData[mDesc.GetOffsetFromMultiIndex(idx)];
if constexpr(ck::is_same_v<ck::remove_cvref_t<T>, ck::pk_i4_t>)
{
return mData[mDesc.GetOffsetFromMultiIndex(idx) / 2];
}
else
{
return mData[mDesc.GetOffsetFromMultiIndex(idx)];
}
}
const T& operator()(std::vector<std::size_t> idx) const
{
return mData[mDesc.GetOffsetFromMultiIndex(idx)];
if constexpr(ck::is_same_v<ck::remove_cvref_t<T>, ck::pk_i4_t>)
{
return mData[mDesc.GetOffsetFromMultiIndex(idx) / 2];
}
else
{
return mData[mDesc.GetOffsetFromMultiIndex(idx)];
}
}
typename Data::iterator begin() { return mData.begin(); }
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -37,7 +37,7 @@ struct GeneratorTensor_1<ck::half_t>
float value = 1.0;
template <typename... Is>
ck::bhalf_t operator()(Is...)
ck::half_t operator()(Is...)
{
return ck::type_convert<ck::half_t>(value);
}
......@@ -62,13 +62,25 @@ struct GeneratorTensor_1<ck::f8_t>
float value = 1.0;
template <typename... Is>
ck::bhalf_t operator()(Is...)
ck::f8_t operator()(Is...)
{
return ck::type_convert<ck::f8_t>(value);
}
};
#endif
template <>
struct GeneratorTensor_1<ck::f4_t>
{
float value = 1.0;
template <typename... Is>
ck::f4_t operator()(Is...)
{
return ck::type_convert<ck::f4_t>(value);
}
};
template <>
struct GeneratorTensor_1<int8_t>
{
......@@ -81,6 +93,20 @@ struct GeneratorTensor_1<int8_t>
}
};
template <>
struct GeneratorTensor_1<ck::pk_i4_t>
{
int8_t value = 1;
template <typename... Is>
ck::pk_i4_t operator()(Is...)
{
int t = value + 8;
ck::pk_i4_t r = ((t << 4) + t) & 0xff;
return r;
}
};
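// Illustrative pack/unpack sketch for the nibble convention used above: each 4-bit
// value is biased by +8 into [0, 15] and two values share one byte (high and low
// nibble). The helpers below work on plain ints, assume the same bias applies when
// decoding, and do not touch ck::pk_i4_t internals.
inline int pack_int4_pair_example(int hi, int lo)
{
    const int h = (hi + 8) & 0xF;
    const int l = (lo + 8) & 0xF;
    return ((h << 4) | l) & 0xFF;
}

inline void unpack_int4_pair_example(int packed, int& hi, int& lo)
{
    hi = ((packed >> 4) & 0xF) - 8;
    lo = (packed & 0xF) - 8;
}
// pack_int4_pair_example(1, 1) == 0x99, matching GeneratorTensor_1<ck::pk_i4_t>
// with value = 1; unpack_int4_pair_example(0x99, hi, lo) restores hi == lo == 1.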
template <typename T>
struct GeneratorTensor_2
{
......@@ -121,6 +147,22 @@ struct GeneratorTensor_2<int8_t>
}
};
template <>
struct GeneratorTensor_2<ck::pk_i4_t>
{
int min_value = 0;
int max_value = 1;
template <typename... Is>
ck::pk_i4_t operator()(Is...)
{
int hi = std::rand() % (max_value - min_value) + min_value + 8;
int lo = std::rand() % (max_value - min_value) + min_value + 8;
ck::pk_i4_t r = ((hi << 4) + lo) & 0xff;
return r;
}
};
#if defined CK_ENABLE_FP8
template <>
struct GeneratorTensor_2<ck::f8_t>
......@@ -153,6 +195,20 @@ struct GeneratorTensor_2<ck::bf8_t>
};
#endif
template <>
struct GeneratorTensor_2<ck::f4_t>
{
int min_value = 0;
int max_value = 1;
template <typename... Is>
ck::f4_t operator()(Is...)
{
float tmp = (std::rand() % (max_value - min_value)) + min_value;
return ck::type_convert<ck::f4_t>(tmp);
}
};
template <typename T>
struct GeneratorTensor_3
{
......@@ -223,6 +279,23 @@ struct GeneratorTensor_3<ck::bf8_t>
};
#endif
template <>
struct GeneratorTensor_3<ck::f4_t>
{
float min_value = 0;
float max_value = 1;
template <typename... Is>
ck::f4_t operator()(Is...)
{
float tmp = float(std::rand()) / float(RAND_MAX);
float fp32_tmp = min_value + tmp * (max_value - min_value);
return ck::type_convert<ck::f4_t>(fp32_tmp);
}
};
template <typename T>
struct GeneratorTensor_4
{
......@@ -256,14 +329,33 @@ struct GeneratorTensor_Checkboard
}
};
template <ck::index_t Dim>
/**
 * @brief Generates sequential values along the specified dimension.
*
* @tparam T The type of the tensor values.
* @tparam Dim The specific dimension used for generation.
*
 * GeneratorTensor_Sequential<float, 1>{} will generate the following values for a 3x3 tensor:
*
* 0 1 2
* 0 1 2
* 0 1 2
*
 * Essentially, the generated values are the logical coordinates of each element along
 * dimension Dim. E.g. for a 2-dimensional tensor and Dim=1, the values are the column
 * indices.
*
*/
template <typename T, ck::index_t Dim>
struct GeneratorTensor_Sequential
{
template <typename... Ts>
float operator()(Ts... Xs) const
T operator()(Ts... Xs) const
{
std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
return dims[Dim];
float tmp = dims[Dim];
return ck::type_convert<T>(tmp);
}
};
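// Usage sketch (assumes the Tensor<> host utility and its GenerateTensorValue
// helper from the host tensor header are in scope; the call pattern follows the
// CK host examples): fill a 3x3 tensor with its column indices, as in the
// comment above.
//
//   Tensor<float> t({3, 3});
//   t.GenerateTensorValue(GeneratorTensor_Sequential<float, 1>{});
//   // now t(i, j) == static_cast<float>(j) for every row i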
......
......@@ -167,7 +167,7 @@ struct StaticTensorTupleOfVectorBuffer
// Idx is for S, not X. Idx should be aligned with X
template <typename X,
typename Idx,
typename enable_if<has_same_scalar_type<S, X>::value &&
typename enable_if<(has_same_scalar_type<S, X>::value || !is_native_type<S>()) &&
is_known_at_compile_time<Idx>::value && Idx::Size() == ndim_,
bool>::type = false>
__host__ __device__ constexpr X GetAsType(Idx) const
......@@ -201,7 +201,7 @@ struct StaticTensorTupleOfVectorBuffer
// Idx is for S, not X. Idx should be aligned with X
template <typename X,
typename Idx,
typename enable_if<has_same_scalar_type<S, X>::value &&
typename enable_if<(has_same_scalar_type<S, X>::value || !is_native_type<S>()) &&
is_known_at_compile_time<Idx>::value && Idx::Size() == ndim_,
bool>::type = false>
__host__ __device__ constexpr void SetAsType(Idx, X x)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp"
namespace ck {
enum struct BlockGemmPipelineVersion
{
v1, // Naive
v2, // Mem
v3, // Comp
v4, // Comp, double lds buffer
v5, // Comp, double global prefetch register buffer
};
template <BlockGemmPipelineVersion BlkGemmPipelineVer,
BlockGemmPipelineScheduler BlkGemmPipeSche,
index_t BlockSize,
typename ADataType,
typename BDataType,
typename ComputeDataType,
typename AccDataType,
typename ATileDesc,
typename BTileDesc,
typename AMmaTileDesc,
typename BMmaTileDesc,
index_t ABlockTransferSrcScalarPerVector,
index_t BBlockTransferSrcScalarPerVector,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t MPerXDL,
index_t NPerXDL,
index_t MRepeat,
index_t NRepeat,
index_t KPack>
constexpr auto BlockGemmPipeline_Selector()
{
if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
{
return BlockwiseGemmXdlops_pipeline_v1_b_scale<BlkGemmPipeSche,
BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>{};
}
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
{
return BlockwiseGemmXdlops_pipeline_v2_b_scale<BlkGemmPipeSche,
BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>{};
}
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
{
return BlockwiseGemmXdlops_pipeline_v3_b_scale<BlkGemmPipeSche,
BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>{};
}
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
{
return BlockwiseGemmXdlops_pipeline_v4_b_scale<BlkGemmPipeSche,
BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>{};
}
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v5)
{
return BlockwiseGemmXdlops_pipeline_v5<BlkGemmPipeSche,
BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>{};
}
else
{
std::cerr << "BlockGemmPipeline configuration is not available" << std::endl;
}
}
} // namespace ck
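// Usage sketch (tile parameters are illustrative placeholders): the selected
// pipeline type is typically recovered at compile time with decltype, e.g.
//
//   using BlockwiseGemmPipe = remove_cvref_t<decltype(
//       BlockGemmPipeline_Selector<BlockGemmPipelineVersion::v3,
//                                  BlockGemmPipelineScheduler::Intrawave,
//                                  /*BlockSize*/ 256,
//                                  ADataType, BDataType, ComputeDataType, AccDataType,
//                                  ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc,
//                                  /*ABlockTransferSrcScalarPerVector*/ 8,
//                                  /*BBlockTransferSrcScalarPerVector*/ 8,
//                                  /*MPerBlock*/ 128, /*NPerBlock*/ 128, /*KPerBlock*/ 64,
//                                  /*MPerXDL*/ 32, /*NPerXDL*/ 32,
//                                  /*MRepeat*/ 2, /*NRepeat*/ 2, /*KPack*/ 8>())>;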
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
namespace ck {
// Naive pipeline with lowest resource request per WGP
// GlobalPrefetchStages: 1
// LocalPreFillStages: 1
// LocalPreFetchStages: 0
// LocalSharedMemoryBuffer: 1
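// Rough stage ordering implemented by Run() below (sketch; the B-scale thread
// copies are omitted): one global prefetch and one LDS prefill up front, then
// each main-loop iteration issues the next global read, performs the LDS->VGPR
// thread copies and MFMAs for the tile currently in LDS, and finally writes the
// prefetched tile into LDS; the tail consumes the last tile already resident in LDS.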
template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
index_t BlockSize,
typename ADataType,
typename BDataType,
typename ComputeDataType,
typename AccDataType,
typename ATileDesc,
typename BTileDesc,
typename AMmaTileDesc,
typename BMmaTileDesc,
index_t ABlockTransferSrcScalarPerVector,
index_t BBlockTransferSrcScalarPerVector,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t MPerXDL,
index_t NPerXDL,
index_t MRepeat,
index_t NRepeat,
index_t KPacks>
struct BlockwiseGemmXdlops_pipeline_v1_b_scale
{
};
template <index_t BlockSize,
typename ADataType,
typename BDataType,
typename ComputeDataType,
typename AccDataType,
typename ATileDesc,
typename BTileDesc,
typename AMmaTileDesc,
typename BMmaTileDesc,
index_t ABlockTransferSrcScalarPerVector,
index_t BBlockTransferSrcScalarPerVector,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t MPerXDL,
index_t NPerXDL,
index_t MRepeat,
index_t NRepeat,
index_t KPack
// ,bool TransposeC //disable transposec right now...
>
struct BlockwiseGemmXdlops_pipeline_v1_b_scale<BlockGemmPipelineScheduler::Intrawave,
BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>
: BlockwiseGemmXdlops_pipeline_base<BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>
{
using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>;
using Base::I0;
using Base::KRepeat;
using Base::xdlops_gemm;
using Base::CalculateCThreadOriginDataIndex;
using Base::CalculateCThreadOriginDataIndex8D;
using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
using Base::GetCThreadBuffer;
using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::a_block_desc_m0_m1_m2_k;
using Base::b_block_desc_n0_n1_n2_k;
using Base::AMmaKStride;
using Base::BMmaKStride;
static constexpr index_t PrefetchStages = 1;
static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = 1;
__host__ static constexpr bool BlockHasHotloop(index_t num_loop)
{
return num_loop > PrefetchStages;
}
__host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
{
ignore = num_loop;
return TailNumber::Full;
}
template <bool HasMainLoop,
TailNumber TailNum,
typename AGridDesc,
typename ABlockDesc,
typename ABlockTransfer,
typename AGridBuffer,
typename ABlockBuffer,
typename ABlockTransferStep,
typename BGridDesc,
typename BBlockDesc,
typename BBlockTransfer,
typename BGridBuffer,
typename BBlockBuffer,
typename BBlockTransferStep,
typename CThreadBuffer,
// BScale Thread Copy
typename BScaleGridBuffer,
typename BScaleGridDesc,
typename BScaleThreadDesc,
typename BScaleThreadTransfer,
typename BScaleThreadTransferStep>
__device__ void Run(
// ABlockCopy
const AGridDesc& a_grid_desc,
const ABlockDesc& a_block_desc,
ABlockTransfer& a_blockwise_copy,
const AGridBuffer& a_grid_buf,
ABlockBuffer& a_block_buf,
const ABlockTransferStep& a_block_copy_step,
// BBlockCopy
const BGridDesc& b_grid_desc,
const BBlockDesc& b_block_desc,
BBlockTransfer& b_blockwise_copy,
const BGridBuffer& b_grid_buf,
BBlockBuffer& b_block_buf,
const BBlockTransferStep& b_block_copy_step,
// CThread
CThreadBuffer& c_thread_buf,
// BScaleThreadCopy
const BScaleGridDesc& b_scale_grid_desc,
const BScaleThreadDesc& b_scale_thread_desc,
BScaleThreadTransfer& b_scale_thread_copy,
const BScaleGridBuffer& b_scale_grid_buf,
const BScaleThreadTransferStep& b_scale_thread_copy_step,
// num_loop
index_t num_loop,
index_t num_loop_per_scale) const
{
// Assumption: KPerBlock equals the scale block size along K, so num_loop_per_scale is unused here
ignore = num_loop_per_scale;
auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
a_thread_desc_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
b_thread_desc_.GetElementSpaceSize());
auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
b_scale_thread_desc.GetElementSpaceSize());
// Global prefetch 1
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_scale_thread_copy.Run(b_scale_grid_desc,
b_scale_grid_buf,
b_scale_thread_desc,
make_tuple(n0, I0),
b_scale_thread_buf);
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<0>{}));
});
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<1>{}));
// Local prefill 1
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
// Initialize C
c_thread_buf.Clear();
auto c_thread_buf_per_scale = remove_cvref_t<decltype(c_thread_buf)>();
// main body
if constexpr(HasMainLoop)
{
index_t i = 0;
do
{
// -------------------------------------------------------------------------------------------
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_buf);
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
c_thread_buf_per_scale.Clear();
static_for<0, KRepeat, 1>{}([&](auto k0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
xdlops_gemm.template Run<>(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
});
static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
c_thread_buf(Number<c_offset>{}) +=
c_thread_buf_per_scale[Number<t>{}] *
type_convert<AccDataType>(b_scale_thread_buf[n0]);
});
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_scale_thread_copy.Run(b_scale_grid_desc,
b_scale_grid_buf,
b_scale_thread_desc,
make_tuple(n0, I0),
b_scale_thread_buf);
b_scale_thread_copy.MoveSrcSliceWindow(
b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{}));
});
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<1>{}));
block_sync_lds();
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
i += 1;
} while(i < (num_loop - 1));
}
// tail
if constexpr(TailNum == TailNumber::Full)
{
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_buf);
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
c_thread_buf_per_scale.Clear();
static_for<0, KRepeat, 1>{}([&](auto k0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
xdlops_gemm.template Run<>(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
});
static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
c_thread_buf(Number<c_offset>{}) +=
c_thread_buf_per_scale[Number<t>{}] *
type_convert<AccDataType>(b_scale_thread_buf[n0]);
});
});
});
}
}
protected:
using Base::a_thread_copy_;
using Base::a_thread_desc_;
using Base::b_thread_copy_;
using Base::b_thread_desc_;
using Base::c_thread_desc_;
};
} // namespace ck
......@@ -269,15 +269,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(
b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
......@@ -341,14 +340,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
......@@ -396,14 +395,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
......@@ -447,14 +446,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
......@@ -760,15 +759,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
a_thread_desc_,
make_tuple(m0, I0, k0, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(
b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
__builtin_amdgcn_sched_barrier(0);
// NOTE: Synchronize threads in a workgroup at the start of each MAC
......@@ -866,14 +864,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
a_thread_desc_,
make_tuple(m0, I0, k0, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
__builtin_amdgcn_sched_barrier(0);
......@@ -942,14 +940,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
a_thread_desc_,
make_tuple(m0, I0, k0, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
__builtin_amdgcn_sched_barrier(0);
......@@ -1018,14 +1016,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
a_thread_desc_,
make_tuple(m0, I0, k0, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
__builtin_amdgcn_sched_barrier(0);
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
namespace ck {
// Maximum global memory throughput pipeline with >=32KB of data in flight
// GlobalPrefetchStages: >=2
// LocalPreFillStages: 1
// LocalPreFetchStages: 0
// LocalSharedMemoryBuffer: 1
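// Sketch of the buffering scheme used by Run() below: GlobalBufferNum equals
// PrefetchStages, so the blockwise copies cycle through PrefetchStages global
// prefetch register buffers; while the tile currently in LDS is being computed,
// buffer (iprefetch + 1) % PrefetchStages is written to LDS and a fresh global
// read refills a register buffer that has already been drained to LDS.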
template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
index_t BlockSize,
typename ADataType,
typename BDataType,
typename ComputeDataType,
typename AccDataType,
typename ATileDesc,
typename BTileDesc,
typename AMmaTileDesc,
typename BMmaTileDesc,
index_t ABlockTransferSrcScalarPerVector,
index_t BBlockTransferSrcScalarPerVector,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t MPerXDL,
index_t NPerXDL,
index_t MRepeat,
index_t NRepeat,
index_t KPacks>
struct BlockwiseGemmXdlops_pipeline_v2_b_scale
{
};
template <index_t BlockSize,
typename ADataType,
typename BDataType,
typename ComputeDataType,
typename AccDataType,
typename ATileDesc,
typename BTileDesc,
typename AMmaTileDesc,
typename BMmaTileDesc,
index_t ABlockTransferSrcScalarPerVector,
index_t BBlockTransferSrcScalarPerVector,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t MPerXDL,
index_t NPerXDL,
index_t MRepeat,
index_t NRepeat,
index_t KPack
// ,bool TransposeC //disable transposec right now...
>
struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Intrawave,
BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>
: BlockwiseGemmXdlops_pipeline_base<BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>
{
using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>;
using Base::I0;
using Base::KRepeat;
using Base::xdlops_gemm;
using Base::CalculateCThreadOriginDataIndex;
using Base::CalculateCThreadOriginDataIndex8D;
using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
using Base::GetCThreadBuffer;
using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::a_block_desc_m0_m1_m2_k;
using Base::b_block_desc_n0_n1_n2_k;
using Base::AMmaKStride;
using Base::BMmaKStride;
static constexpr index_t WgpPerCU =
(4 * warpSize / BlockSize) >= 1 ? 4 * warpSize / BlockSize : 1;
static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
32768 / WgpPerCU,
(MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
static constexpr index_t PrefetchStages =
FullMemBandPrefetchStages >= 2
? FullMemBandPrefetchStages <= 8 ? FullMemBandPrefetchStages : 8
: 2;
static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = PrefetchStages;
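// Worked example (illustrative numbers, assuming a 64-lane wavefront): with
// BlockSize = 256, WgpPerCU = 4 * 64 / 256 = 1. For fp16 A/B tiles with
// MPerBlock = NPerBlock = 128 and KPerBlock = 64, one stage moves
// (128 * 2 + 128 * 2) * 64 = 32768 bytes, so FullMemBandPrefetchStages =
// ceil(32768 / 1 / 32768) = 1, which is then clamped to the minimum of 2
// prefetch stages (values above 8 would be capped at 8).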
__host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
{
return num_loop > PrefetchStages;
}
__host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
{
if(num_loop % PrefetchStages == 1)
{
return TailNumber::One;
}
else if(num_loop % PrefetchStages == 2)
{
return TailNumber::Two;
}
else if(num_loop % PrefetchStages == 3)
{
return TailNumber::Three;
}
else if(num_loop % PrefetchStages == 4)
{
return TailNumber::Four;
}
else if(num_loop % PrefetchStages == 5)
{
return TailNumber::Five;
}
else if(num_loop % PrefetchStages == 6)
{
return TailNumber::Six;
}
else if(num_loop % PrefetchStages == 7)
{
return TailNumber::Seven;
}
else
{
return TailNumber::Full;
}
}
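// Example: with PrefetchStages == 4 and num_loop == 10, BlockHasHotloop(10) is
// true (10 > 4) and 10 % 4 == 2, so BlockLoopTailNum returns TailNumber::Two and
// Run() below finishes with LoopTailFunc(Number<2>{}).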
template <bool HasMainLoop,
TailNumber TailNum,
typename AGridDesc,
typename ABlockDesc,
typename ABlockTransfer,
typename AGridBuffer,
typename ABlockBuffer,
typename ABlockTransferStep,
typename BGridDesc,
typename BBlockDesc,
typename BBlockTransfer,
typename BGridBuffer,
typename BBlockBuffer,
typename BBlockTransferStep,
typename CThreadBuffer>
__device__ void Run(const AGridDesc& a_grid_desc,
const ABlockDesc& a_block_desc,
ABlockTransfer& a_blockwise_copy,
const AGridBuffer& a_grid_buf,
ABlockBuffer& a_block_buf,
const ABlockTransferStep& a_block_copy_step,
const BGridDesc& b_grid_desc,
const BBlockDesc& b_block_desc,
BBlockTransfer& b_blockwise_copy,
const BGridBuffer& b_grid_buf,
BBlockBuffer& b_block_buf,
const BBlockTransferStep& b_block_copy_step,
CThreadBuffer& c_thread_buf,
index_t num_loop) const
{
auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
a_thread_desc_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
b_thread_desc_.GetElementSpaceSize());
// Global prefetch 1
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
// Initialize C
c_thread_buf.Clear();
// Local prefill 1
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
// Global prefetch [2, PrefetchStages]
static_for<1, PrefetchStages, 1>{}([&](auto iprefetch) {
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
});
// main body
if constexpr(HasMainLoop)
{
index_t i = 0;
do
{
static_for<0, PrefetchStages, 1>{}([&](auto iprefetch) {
// -------------------------------------------------------------------------------------------
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(
b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
});
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
xdlops_gemm.Run(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
});
});
});
block_sync_lds();
a_blockwise_copy.RunWrite(
a_block_desc, a_block_buf, Number<(iprefetch + 1) % PrefetchStages>{});
b_blockwise_copy.RunWrite(
b_block_desc, b_block_buf, Number<(iprefetch + 1) % PrefetchStages>{});
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
});
i += PrefetchStages;
} while(i < (num_loop - PrefetchStages));
}
// tail
auto LoopTailFunc = [&](auto tail_num) {
static_for<1, tail_num, 1>{}([&](auto iprefetch) {
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
});
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
xdlops_gemm.Run(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
});
});
});
block_sync_lds();
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, iprefetch);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, iprefetch);
});
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
});
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
});
});
});
};
if constexpr(TailNum == TailNumber::One)
{
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
});
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
});
});
});
}
else if constexpr(TailNum == TailNumber::Two)
{
LoopTailFunc(Number<2>{});
}
else if constexpr(TailNum == TailNumber::Three)
{
LoopTailFunc(Number<3>{});
}
else if constexpr(TailNum == TailNumber::Four)
{
LoopTailFunc(Number<4>{});
}
else if constexpr(TailNum == TailNumber::Five)
{
LoopTailFunc(Number<5>{});
}
else if constexpr(TailNum == TailNumber::Six)
{
LoopTailFunc(Number<6>{});
}
else if constexpr(TailNum == TailNumber::Seven)
{
LoopTailFunc(Number<7>{});
}
else if constexpr(TailNum == TailNumber::Full)
{
LoopTailFunc(Number<PrefetchStages>{});
}
}
protected:
using Base::a_thread_copy_;
using Base::a_thread_desc_;
using Base::b_thread_copy_;
using Base::b_thread_desc_;
using Base::c_thread_desc_;
};
template <index_t BlockSize,
typename ADataType,
typename BDataType,
typename ComputeDataType,
typename AccDataType,
typename ATileDesc,
typename BTileDesc,
typename AMmaTileDesc,
typename BMmaTileDesc,
index_t ABlockTransferSrcScalarPerVector,
index_t BBlockTransferSrcScalarPerVector,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t MPerXDL,
index_t NPerXDL,
index_t MRepeat,
index_t NRepeat,
index_t KPack
// ,bool TransposeC //disable transposec right now...
>
struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Interwave,
BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>
: BlockwiseGemmXdlops_pipeline_base<BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>
{
using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>;
using Base::A_K1;
using Base::B_K1;
using Base::I0;
using Base::I1;
using Base::KPerThread;
using Base::xdlops_gemm;
using Base::CalculateCThreadOriginDataIndex;
using Base::CalculateCThreadOriginDataIndex8D;
using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
using Base::GetCThreadBuffer;
using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::a_block_desc_m0_m1_m2_k;
using Base::b_block_desc_n0_n1_n2_k;
static constexpr index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS;
static constexpr index_t KPerInnerLoop = math::max(KPerThread / NumMacClusters, KPack);
static constexpr index_t KRepeat = KPerThread / KPerInnerLoop;
static constexpr index_t WgpPerCU =
(4 * warpSize / BlockSize) >= 1 ? 4 * warpSize / BlockSize : 1;
static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
32768 / WgpPerCU,
(MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
static constexpr index_t PrefetchStages =
FullMemBandPrefetchStages >= 2
? FullMemBandPrefetchStages <= 8 ? FullMemBandPrefetchStages : 8
: 2;
static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = PrefetchStages;
__host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
{
return num_loop > PrefetchStages;
}
__host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
{
if(num_loop % PrefetchStages == 1)
{
return TailNumber::One;
}
else if(num_loop % PrefetchStages == 2)
{
return TailNumber::Two;
}
else if(num_loop % PrefetchStages == 3)
{
return TailNumber::Three;
}
else if(num_loop % PrefetchStages == 4)
{
return TailNumber::Four;
}
else if(num_loop % PrefetchStages == 5)
{
return TailNumber::Five;
}
else if(num_loop % PrefetchStages == 6)
{
return TailNumber::Six;
}
else if(num_loop % PrefetchStages == 7)
{
return TailNumber::Seven;
}
else
{
return TailNumber::Full;
}
}
template <bool HasMainLoop,
TailNumber TailNum,
typename AGridDesc,
typename ABlockDesc,
typename ABlockTransfer,
typename AGridBuffer,
typename ABlockBuffer,
typename ABlockTransferStep,
typename BGridDesc,
typename BBlockDesc,
typename BBlockTransfer,
typename BGridBuffer,
typename BBlockBuffer,
typename BBlockTransferStep,
typename CThreadBuffer,
typename BScaleGridBuffer,
typename BScaleGridDesc,
typename BScaleThreadDesc,
typename BScaleThreadTransfer,
typename BScaleThreadTransferStep>
__device__ void Run(const AGridDesc& a_grid_desc,
const ABlockDesc& a_block_desc,
ABlockTransfer& a_blockwise_copy,
const AGridBuffer& a_grid_buf,
ABlockBuffer& a_block_buf,
const ABlockTransferStep& a_block_copy_step,
const BGridDesc& b_grid_desc,
const BBlockDesc& b_block_desc,
BBlockTransfer& b_blockwise_copy,
const BGridBuffer& b_grid_buf,
BBlockBuffer& b_block_buf,
const BBlockTransferStep& b_block_copy_step,
CThreadBuffer& c_thread_buf,
const BScaleGridDesc& b_scale_grid_desc,
// BScaleThreadCopy
const BScaleThreadDesc& b_scale_thread_desc,
BScaleThreadTransfer& b_scale_thread_copy,
const BScaleGridBuffer& b_scale_grid_buf,
const BScaleThreadTransferStep& b_scale_thread_copy_step,
// num loop
index_t num_loop,
index_t num_loop_per_scale) const
{
ignore = num_loop_per_scale;
auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
a_thread_desc_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
b_thread_desc_.GetElementSpaceSize());
auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
b_scale_thread_desc.GetElementSpaceSize());
// Global prefetch 1
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_scale_thread_copy.Run(b_scale_grid_desc,
b_scale_grid_buf,
b_scale_thread_desc,
make_tuple(n0, I0),
b_scale_thread_buf);
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<0>{}));
});
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<1>{}));
// Initialize C
c_thread_buf.Clear();
// Local prefill 1
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
// Global prefetch [2, PrefetchStages]
static_for<1, PrefetchStages, 1>{}([&](auto iprefetch) {
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
});
auto c_thread_buf_per_scale = remove_cvref_t<decltype(c_thread_buf)>(); // currently unused in this path: the per-scale accumulation below is commented out
// main body
if constexpr(HasMainLoop)
{
index_t i = 0;
do
{
static_for<0, PrefetchStages, 1>{}([&](auto iprefetch) {
// -------------------------------------------------------------------------------------------
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k0 * KPerInnerLoop>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k0, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(
b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
});
__builtin_amdgcn_sched_barrier(0);
// NOTE: Synchronize threads in a workgroup at the start of each MAC
// cluster, except the first, since the non-MAC cluster can be shortened
// a bit with no observable negative impact. The desired effect is that
// waves in a workgroup execute their MAC clusters in sync, which avoids
// out-of-sync waves hijacking MAC resources from other workgroups and
// reducing the chance of latency hiding by waiting for the rest of the
// workgroup at the eventual sync point.
if constexpr(k0.value != 0 || KRepeat == 1)
{
__builtin_amdgcn_s_barrier();
__builtin_amdgcn_sched_barrier(0);
}
static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, k_ + ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, k_ + ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
// The block_sync_lds() here performs double duty:
// A) it safeguards against data hazards, because the barrier from
//    blockwise_gemm has been moved here;
// B) it reduces VMEM FIFO congestion by applying small delays to
//    different wavefronts.
// It is placed near the end of the MAC cluster to minimize the
// lgkmcnt penalty.
if constexpr(k0.value == KRepeat - 1 &&
k_.value == KPerInnerLoop - KPack &&
m0.value == MRepeat - 1 && n0.value == NRepeat - 1)
{
__builtin_amdgcn_sched_barrier(0);
block_sync_lds();
__builtin_amdgcn_sched_barrier(0);
}
xdlops_gemm.Run(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
{
__builtin_amdgcn_sched_barrier(0);
__builtin_amdgcn_s_setprio(1);
__builtin_amdgcn_sched_barrier(0);
}
});
// static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t)
// {
// constexpr index_t c_offset =
// c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
// c_thread_buf(Number<c_offset>{}) +=
// c_thread_buf_per_scale[Number<t>{}] *
// type_convert<AccDataType>(b_scale_thread_buf[n0]);
// });
});
});
__builtin_amdgcn_sched_barrier(0);
__builtin_amdgcn_s_setprio(0);
__builtin_amdgcn_sched_barrier(0);
});
// static_for<0, NRepeat, 1>{}([&](auto n0) {
// b_scale_thread_copy.Run(b_scale_grid_desc,
// b_scale_grid_buf,
// b_scale_thread_desc,
// make_tuple(n0, I0),
// b_scale_thread_buf);
// b_scale_thread_copy.MoveSrcSliceWindow(
// b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{}));
// });
// b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
// b_scale_thread_copy_step.At(Number<1>{}));
// block_sync_lds();
a_blockwise_copy.RunWrite(
a_block_desc, a_block_buf, Number<(iprefetch + 1) % PrefetchStages>{});
b_blockwise_copy.RunWrite(
b_block_desc, b_block_buf, Number<(iprefetch + 1) % PrefetchStages>{});
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
});
i += PrefetchStages;
} while(i < (num_loop - PrefetchStages));
}
// tail
auto LoopTailFunc = [&](auto tail_num) {
static_for<1, tail_num, 1>{}([&](auto iprefetch) {
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k0 * KPerInnerLoop>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k0, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
});
__builtin_amdgcn_sched_barrier(0);
if constexpr(k0.value != 0 || KRepeat == 1)
{
__builtin_amdgcn_s_barrier();
__builtin_amdgcn_sched_barrier(0);
}
static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, k_ + ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, k_ + ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
if constexpr(k0.value == KRepeat - 1 &&
k_.value == KPerInnerLoop - KPack &&
m0.value == MRepeat - 1 && n0.value == NRepeat - 1)
{
__builtin_amdgcn_sched_barrier(0);
block_sync_lds();
__builtin_amdgcn_sched_barrier(0);
}
xdlops_gemm.Run(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
{
__builtin_amdgcn_sched_barrier(0);
__builtin_amdgcn_s_setprio(1);
__builtin_amdgcn_sched_barrier(0);
}
});
// static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
// constexpr index_t c_offset =
// c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
// c_thread_buf(Number<c_offset>{}) +=
// c_thread_buf_per_scale[Number<t>{}] *
// type_convert<AccDataType>(b_scale_thread_buf[n0]);
// });
});
});
__builtin_amdgcn_sched_barrier(0);
__builtin_amdgcn_s_setprio(0);
__builtin_amdgcn_sched_barrier(0);
});
// static_for<0, NRepeat, 1>{}([&](auto n0) {
// b_scale_thread_copy.Run(b_scale_grid_desc,
// b_scale_grid_buf,
// b_scale_thread_desc,
// make_tuple(n0, I0),
// b_scale_thread_buf);
// b_scale_thread_copy.MoveSrcSliceWindow(
// b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{}));
// });
// b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
// b_scale_thread_copy_step.At(Number<1>{}));
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, iprefetch);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, iprefetch);
});
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k0 * KPerInnerLoop>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k0, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
});
__builtin_amdgcn_sched_barrier(0);
if constexpr(k0.value != 0 || KRepeat == 1)
{
__builtin_amdgcn_s_barrier();
__builtin_amdgcn_sched_barrier(0);
}
static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, k_ + ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, k_ + ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
if constexpr(k0.value == KRepeat - 1 &&
k_.value == KPerInnerLoop - KPack &&
m0.value == MRepeat - 1 && n0.value == NRepeat - 1)
{
__builtin_amdgcn_sched_barrier(0);
block_sync_lds();
__builtin_amdgcn_sched_barrier(0);
}
xdlops_gemm.Run(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
{
__builtin_amdgcn_sched_barrier(0);
__builtin_amdgcn_s_setprio(1);
__builtin_amdgcn_sched_barrier(0);
}
});
// static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
// constexpr index_t c_offset =
// c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
// c_thread_buf(Number<c_offset>{}) +=
// c_thread_buf_per_scale[Number<t>{}] *
// type_convert<AccDataType>(b_scale_thread_buf[n0]);
// });
});
});
__builtin_amdgcn_sched_barrier(0);
__builtin_amdgcn_s_setprio(0);
__builtin_amdgcn_sched_barrier(0);
});
};
if constexpr(TailNum == TailNumber::One)
{
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k0 * KPerInnerLoop>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k0, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
});
__builtin_amdgcn_sched_barrier(0);
if constexpr(k0.value != 0 || KRepeat == 1)
{
__builtin_amdgcn_s_barrier();
__builtin_amdgcn_sched_barrier(0);
}
static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, k_ + ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, k_ + ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
if constexpr(k0.value == KRepeat - 1 &&
k_.value == KPerInnerLoop - KPack &&
m0.value == MRepeat - 1 && n0.value == NRepeat - 1)
{
__builtin_amdgcn_sched_barrier(0);
block_sync_lds();
__builtin_amdgcn_sched_barrier(0);
}
xdlops_gemm.Run(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
{
__builtin_amdgcn_sched_barrier(0);
__builtin_amdgcn_s_setprio(1);
__builtin_amdgcn_sched_barrier(0);
}
});
// static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
// constexpr index_t c_offset =
// c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
// c_thread_buf(Number<c_offset>{}) +=
// c_thread_buf_per_scale[Number<t>{}] *
// type_convert<AccDataType>(b_scale_thread_buf[n0]);
// });
});
});
__builtin_amdgcn_sched_barrier(0);
__builtin_amdgcn_s_setprio(0);
__builtin_amdgcn_sched_barrier(0);
});
}
else if constexpr(TailNum == TailNumber::Two)
{
LoopTailFunc(Number<2>{});
}
else if constexpr(TailNum == TailNumber::Three)
{
LoopTailFunc(Number<3>{});
}
else if constexpr(TailNum == TailNumber::Four)
{
LoopTailFunc(Number<4>{});
}
else if constexpr(TailNum == TailNumber::Five)
{
LoopTailFunc(Number<5>{});
}
else if constexpr(TailNum == TailNumber::Six)
{
LoopTailFunc(Number<6>{});
}
else if constexpr(TailNum == TailNumber::Seven)
{
LoopTailFunc(Number<7>{});
}
else if constexpr(TailNum == TailNumber::Full)
{
LoopTailFunc(Number<PrefetchStages>{});
}
}
protected:
// K->M loop ordering
static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor(
make_tuple(Number<MRepeat>{}, I1, Number<KRepeat>{}, Number<KPerInnerLoop>{}),
make_tuple(Number<KPerInnerLoop>{},
Number<KRepeat * MRepeat * KPerInnerLoop>{},
Number<MRepeat * KPerInnerLoop>{},
I1));
static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor(
make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPerInnerLoop>{}),
make_tuple(Number<KPerInnerLoop>{},
Number<KRepeat * NRepeat * KPerInnerLoop>{},
Number<NRepeat * KPerInnerLoop>{},
I1));
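// Layout note: both thread descriptors above keep KPerInnerLoop innermost with
// stride 1, so the KPack elements gathered for each MFMA input are contiguous
// in the register buffer.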
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
ComputeDataType,
decltype(a_block_desc_m0_m1_m2_k),
decltype(a_thread_desc_),
Sequence<1, 1, 1, KPerInnerLoop>,
Sequence<0, 1, 2, 3>,
3,
A_K1,
A_K1>;
using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<BDataType,
ComputeDataType,
decltype(b_block_desc_n0_n1_n2_k),
decltype(b_thread_desc_),
Sequence<1, 1, 1, KPerInnerLoop>,
Sequence<0, 1, 2, 3>,
3,
B_K1,
B_K1>;
AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex()};
BThreadCopy b_thread_copy_{Base::CalculateBThreadOriginDataIndex()};
using Base::c_thread_desc_;
};
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
namespace ck {
// Compute optimized pipeline
// GlobalPrefetchStages: 2
// LocalPreFillStages: 1
// LocalPreFetchStages: 1
// LocalSharedMemoryBuffer: 1
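// A sketch of how the stage counts above map onto Run() below: the two global
// prefetches are issued before the hot loop (RunRead), the single local prefill
// writes the first tile into the one LDS buffer, and the single local prefetch
// loads it into registers; each hot-loop iteration then overlaps the next global
// read and LDS write with the MFMA work on the tile already held in registers.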
template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
index_t BlockSize,
typename ADataType,
typename BDataType,
typename ComputeDataType,
typename AccDataType,
typename ATileDesc,
typename BTileDesc,
typename AMmaTileDesc,
typename BMmaTileDesc,
index_t ABlockTransferSrcScalarPerVector,
index_t BBlockTransferSrcScalarPerVector,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t MPerXDL,
index_t NPerXDL,
index_t MRepeat,
index_t NRepeat,
index_t KPacks>
struct BlockwiseGemmXdlops_pipeline_v3_b_scale
{
};
template <index_t BlockSize,
typename ADataType,
typename BDataType,
typename ComputeDataType,
typename AccDataType,
typename ATileDesc,
typename BTileDesc,
typename AMmaTileDesc,
typename BMmaTileDesc,
index_t ABlockTransferSrcScalarPerVector,
index_t BBlockTransferSrcScalarPerVector,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t MPerXDL,
index_t NPerXDL,
index_t MRepeat,
index_t NRepeat,
index_t KPack
// ,bool TransposeC // TransposeC is disabled for now
>
struct BlockwiseGemmXdlops_pipeline_v3_b_scale<BlockGemmPipelineScheduler::Intrawave,
BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>
: BlockwiseGemmXdlops_pipeline_base<BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>
{
using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>;
using Base::I0;
using Base::I1;
using Base::KRepeat;
using Base::xdlops_gemm;
using typename Base::HotLoopInstList;
using Base::CalculateCThreadOriginDataIndex;
using Base::CalculateCThreadOriginDataIndex8D;
using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
using Base::GetCThreadBuffer;
using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::a_block_desc_m0_m1_m2_k;
using Base::b_block_desc_n0_n1_n2_k;
using Base::AMmaKStride;
using Base::BMmaKStride;
static constexpr index_t PrefetchStages = 2;
static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = 1;
__host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
{
return num_loop > PrefetchStages;
}
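// The tail is always reported as TailNumber::Full: with PrefetchStages == 2 and
// a single LDS buffer, exactly one prefetched tile remains to be drained after
// the hot loop, so no other tail variants are needed here.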
__host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
{
ignore = num_loop;
return TailNumber::Full;
}
__device__ static constexpr auto HotLoopScheduler()
{
// A/B split schedule
// The compiler is likely to use ds_read2 when the instruction width is smaller than 16 bytes
constexpr auto num_ds_read_inst_a =
HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16
? HotLoopInstList::A_LDS_Read_Inst_Num
: HotLoopInstList::A_LDS_Read_Inst_Num / 2;
constexpr auto num_ds_read_inst_b =
HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16
? HotLoopInstList::B_LDS_Read_Inst_Num
: HotLoopInstList::B_LDS_Read_Inst_Num / 2;
constexpr auto num_ds_write_inst_a = HotLoopInstList::A_LDS_Write_Inst_Num;
constexpr auto num_ds_write_inst_b = HotLoopInstList::B_LDS_Write_Inst_Num;
constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num;
constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32;
constexpr auto ds_read_a_issue_cycle =
HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
constexpr auto ds_read_b_issue_cycle =
HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16 ? 8 : 4;
constexpr auto ds_read_a_mfma_rate =
(mfma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle);
constexpr auto ds_read_b_mfma_rate =
(mfma_cycle - 4 + 2 * ds_read_b_issue_cycle - 1) / (2 * ds_read_b_issue_cycle);
constexpr auto num_dsread_a_mfma =
(num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate;
constexpr auto num_dsread_b_mfma =
(num_ds_read_inst_b + ds_read_b_mfma_rate - 1) / ds_read_b_mfma_rate;
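// Worked example of the rate arithmetic above (illustrative numbers): with
// NPerXDL == 32 the mfma_cycle is 32 and a 16-byte ds_read has an issue cycle of
// 8, so ds_read_a_mfma_rate = (32 - 4 + 2 * 8 - 1) / (2 * 8) = 43 / 16 = 2, i.e.
// roughly two DS reads are grouped with each MFMA slot in stage 2 below.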
// stage 1
// Separate this part?
// constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) >
// sizeof(ComputeDataType) / sizeof(BDataType)
// ? sizeof(ComputeDataType) / sizeof(ADataType)
// : sizeof(ComputeDataType) / sizeof(BDataType);
constexpr auto num_mfma_stage1 = num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma);
constexpr auto num_mfma_per_issue =
num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b);
constexpr auto num_dswrite_per_issue_a = num_ds_write_inst_a / num_buffer_load_inst_a;
constexpr auto num_dswrite_per_issue_b = num_ds_write_inst_b / num_buffer_load_inst_b;
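// The masks passed to __builtin_amdgcn_sched_group_barrier below follow the LLVM
// AMDGPU scheduling-group encoding already annotated in this file: 0x008 = MFMA,
// 0x020 = VMEM read, 0x100 = DS read, 0x200 = DS write; the second argument is
// the group size and the third the sync id.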
static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
ignore = i;
static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) {
ignore = idswrite;
__builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
__builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
});
__builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
__builtin_amdgcn_sched_group_barrier(
0x008, num_mfma_per_issue - num_dswrite_per_issue_a, 0); // MFMA
});
static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
ignore = i;
static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) {
ignore = idswrite;
__builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
__builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
});
__builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
__builtin_amdgcn_sched_group_barrier(
0x008, num_mfma_per_issue - num_dswrite_per_issue_b, 0); // MFMA
});
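// Stage 1 above interleaved the global (VMEM) loads and LDS writes with MFMAs;
// stage 2 below drains the remaining LDS reads, one small DS-read group per MFMA.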
// stage 2
static_for<0, num_dsread_a_mfma, 1>{}([&](auto i) {
if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >=
ds_read_a_mfma_rate)
{
__builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
}
else
{
__builtin_amdgcn_sched_group_barrier(0x100,
num_ds_read_inst_a - (num_dsread_a_mfma - 1) *
ds_read_a_mfma_rate,
0); // DS read
}
__builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
});
static_for<0, num_dsread_b_mfma, 1>{}([&](auto i) {
if constexpr((num_ds_read_inst_b - (i + 1) * ds_read_b_mfma_rate) >=
ds_read_b_mfma_rate)
{
__builtin_amdgcn_sched_group_barrier(0x100, ds_read_b_mfma_rate, 0); // DS read
}
else
{
__builtin_amdgcn_sched_group_barrier(0x100,
num_ds_read_inst_b - (num_dsread_b_mfma - 1) *
ds_read_b_mfma_rate,
0); // DS read
}
__builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
});
}
template <bool HasMainLoop,
TailNumber TailNum,
typename AGridDesc,
typename ABlockDesc,
typename ABlockTransfer,
typename AGridBuffer,
typename ABlockBuffer,
typename ABlockTransferStep,
typename BGridDesc,
typename BBlockDesc,
typename BBlockTransfer,
typename BGridBuffer,
typename BBlockBuffer,
typename BBlockTransferStep,
typename CThreadBuffer,
typename BScaleGridBuffer,
typename BScaleGridDesc,
typename BScaleThreadDesc,
typename BScaleThreadTransfer,
typename BScaleThreadTransferStep>
__device__ void Run(const AGridDesc& a_grid_desc,
const ABlockDesc& a_block_desc,
ABlockTransfer& a_blockwise_copy,
const AGridBuffer& a_grid_buf,
ABlockBuffer& a_block_buf,
const ABlockTransferStep& a_block_copy_step,
const BGridDesc& b_grid_desc,
const BBlockDesc& b_block_desc,
BBlockTransfer& b_blockwise_copy,
const BGridBuffer& b_grid_buf,
BBlockBuffer& b_block_buf,
const BBlockTransferStep& b_block_copy_step,
CThreadBuffer& c_thread_buf,
// BScaleThreadCopy
const BScaleGridDesc& b_scale_grid_desc,
const BScaleThreadDesc& b_scale_thread_desc,
BScaleThreadTransfer& b_scale_thread_copy,
const BScaleGridBuffer& b_scale_grid_buf,
const BScaleThreadTransferStep& b_scale_thread_copy_step,
// num loop
index_t num_loop,
index_t num_loop_per_scale) const
{
__builtin_amdgcn_sched_barrier(0);
auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
a_thread_desc_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
b_thread_desc_.GetElementSpaceSize());
// B scale buffer
auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
b_scale_thread_desc.GetElementSpaceSize());
// Global prefetch 1
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_scale_thread_copy.Run(b_scale_grid_desc,
b_scale_grid_buf,
b_scale_thread_desc,
make_tuple(n0, I0),
b_scale_thread_buf);
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<0>{}));
});
if(num_loop_per_scale == 1)
{
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<2>{}));
}
else
{
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<1>{}));
}
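// Scale-window policy assumed here: after gathering the NRepeat scale values
// (one At(0) step per n0), the larger At(2) step is taken whenever a fresh set of
// B scales will be needed for the next buffered tile (num_loop_per_scale == 1),
// otherwise the smaller At(1) step is taken; the hot loop repeats the same check
// as (i + PrefetchStages) % num_loop_per_scale == 0.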
constexpr auto num_scale_k_block = BScaleThreadDesc{}.GetLength(I1);
constexpr auto num_scale_krepeat = KRepeat / num_scale_k_block;
// Local prefill 1
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
// Global prefetch 2
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
// Initialize C
c_thread_buf.Clear();
// Local prefetch 1
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k0 * AMmaKStride>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k0, I0),
a_thread_buf);
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(
b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * BMmaKStride>{}),
b_block_buf,
b_scale_thread_buf[Number<n0 * num_scale_k_block + k0 / num_scale_krepeat>{}],
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
});
__builtin_amdgcn_sched_barrier(0);
// main body
if constexpr(HasMainLoop)
{
index_t i = 0;
do
{
block_sync_lds();
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_scale_thread_copy.Run(b_scale_grid_desc,
b_scale_grid_buf,
b_scale_thread_desc,
make_tuple(n0, I0),
b_scale_thread_buf);
b_scale_thread_copy.MoveSrcSliceWindow(
b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{}));
});
if((i + 2) % num_loop_per_scale == 0)
{
b_scale_thread_copy.MoveSrcSliceWindow(
b_scale_grid_desc, b_scale_thread_copy_step.At(Number<2>{}));
}
else
{
b_scale_thread_copy.MoveSrcSliceWindow(
b_scale_grid_desc, b_scale_thread_copy_step.At(Number<1>{}));
}
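// Illustrative example: with num_loop_per_scale == 4 the branch above takes the
// At(2) step on hot-loop iterations i = 2, 6, 10, ... and the At(1) step
// otherwise, so the scale window advances once per group of four K-block
// iterations.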
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
xdlops_gemm.Run(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
});
});
});
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k0 * AMmaKStride>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k0, I0),
a_thread_buf);
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k0 * BMmaKStride>{}),
b_block_buf,
b_scale_thread_buf[Number<n0 * num_scale_k_block +
k0 / num_scale_krepeat>{}],
b_thread_desc_,
make_tuple(n0, I0, k0, I0),
b_thread_buf);
});
});
HotLoopScheduler();
__builtin_amdgcn_sched_barrier(0);
i += 1;
} while(i < (num_loop - 1));
}
// tail
if constexpr(TailNum == TailNumber::Full)
{
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
});
});
});
__builtin_amdgcn_sched_barrier(0);
}
}
protected:
using Base::a_thread_copy_;
using Base::a_thread_desc_;
using Base::b_thread_copy_;
using Base::b_thread_desc_;
using Base::c_thread_desc_;
};
} // namespace ck
......@@ -305,14 +305,14 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_bufs(I0));
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf.At(I0),
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_bufs(I0));
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf.At(I0),
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_bufs(I0));
});
});
......@@ -356,15 +356,14 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_bufs(lds_read_reg_buf));
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(
b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf.At(lds_read_buf),
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_bufs(lds_read_reg_buf));
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf.At(lds_read_buf),
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_bufs(lds_read_reg_buf));
});
});
......@@ -437,14 +436,14 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_bufs(lds_read_reg_buf));
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf.At(lds_read_buf),
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_bufs(lds_read_reg_buf));
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf.At(lds_read_buf),
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_bufs(lds_read_reg_buf));
});
});
......@@ -496,14 +495,14 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_bufs(lds_read_reg_buf));
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf.At(lds_read_buf),
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_bufs(lds_read_reg_buf));
});
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf.At(lds_read_buf),
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_bufs(lds_read_reg_buf));
});
});
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
namespace ck {
// Compute optimized pipeline with the highest resource requirements
// GlobalPrefetchStages: 4
// LocalPreFillStages: 2
// LocalPreFetchStages: 1
// LocalSharedMemoryBuffer: 2
template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
index_t BlockSize,
typename ADataType,
typename BDataType,
typename ComputeDataType,
typename AccDataType,
typename ATileDesc,
typename BTileDesc,
typename AMmaTileDesc,
typename BMmaTileDesc,
index_t ABlockTransferSrcScalarPerVector,
index_t BBlockTransferSrcScalarPerVector,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t MPerXDL,
index_t NPerXDL,
index_t MRepeat,
index_t NRepeat,
index_t KPacks>
struct BlockwiseGemmXdlops_pipeline_v4_b_scale
{
};
template <index_t BlockSize,
typename ADataType,
typename BDataType,
typename ComputeDataType,
typename AccDataType,
typename ATileDesc,
typename BTileDesc,
typename AMmaTileDesc,
typename BMmaTileDesc,
index_t ABlockTransferSrcScalarPerVector,
index_t BBlockTransferSrcScalarPerVector,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t MPerXDL,
index_t NPerXDL,
index_t MRepeat,
index_t NRepeat,
index_t KPack
// ,bool TransposeC // TransposeC is disabled for now
>
struct BlockwiseGemmXdlops_pipeline_v4_b_scale<BlockGemmPipelineScheduler::Intrawave,
BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>
: BlockwiseGemmXdlops_pipeline_base<BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>
{
using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>;
using Base::I0;
using Base::I1;
using Base::KRepeat;
using Base::xdlops_gemm;
using typename Base::HotLoopInstList;
using Base::CalculateCThreadOriginDataIndex;
using Base::CalculateCThreadOriginDataIndex8D;
using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
using Base::GetCThreadBuffer;
using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
using Base::a_block_desc_m0_m1_m2_k;
using Base::b_block_desc_n0_n1_n2_k;
using Base::AMmaKStride;
using Base::BMmaKStride;
static constexpr index_t PrefetchStages = 3;
static constexpr index_t PrefillStages = 2;
static constexpr index_t GlobalBufferNum = 1;
static constexpr index_t HotloopUnroll = 2;
__host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
{
return num_loop > PrefetchStages;
}
__host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
{
if(num_loop % HotloopUnroll == 1)
{
return TailNumber::Odd;
}
else
{
return TailNumber::Even;
}
}
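// Illustrative example: num_loop == 7 gives 7 % HotloopUnroll == 1 and hence
// TailNumber::Odd, while num_loop == 8 gives TailNumber::Even; the tail at the
// end of Run() then drains the remaining prefetched tiles with the matching
// sequence of ReadWriteCompFunc / ReadCompFunc / CompFunc stages.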
__device__ static constexpr void HotLoopScheduler()
{
// TODO: Take the data type into consideration, as in pipeline ver 3
// A/B split schedule
constexpr auto num_ds_read_inst_a =
HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16
? HotLoopInstList::A_LDS_Read_Inst_Num
: HotLoopInstList::A_LDS_Read_Inst_Num / 2;
constexpr auto num_ds_read_inst_b =
HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16
? HotLoopInstList::B_LDS_Read_Inst_Num
: HotLoopInstList::B_LDS_Read_Inst_Num / 2;
constexpr auto num_issue_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
constexpr auto num_dswrite_per_issue_a =
(HotLoopInstList::A_LDS_Write_Inst_Num + num_issue_a - 1) / num_issue_a;
constexpr auto num_dsread_per_issue_a = num_ds_read_inst_a / num_issue_a;
constexpr auto num_issue_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
constexpr auto num_dswrite_per_issue_b =
(HotLoopInstList::B_LDS_Write_Inst_Num + num_issue_b - 1) / num_issue_b;
constexpr auto num_dsread_per_issue_b = num_ds_read_inst_b / num_issue_b;
constexpr auto num_mfma_per_issue =
HotLoopInstList::C_MFMA_Inst_Num / (num_issue_a + num_issue_b);
static_for<0, num_issue_a, 1>{}([&](auto i) {
ignore = i;
static_for<0, num_dsread_per_issue_a, 1>{}([&](auto idsread) {
ignore = idsread;
__builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
__builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
});
static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) {
ignore = idswrite;
__builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
__builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
});
__builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
__builtin_amdgcn_sched_group_barrier(0x008,
num_mfma_per_issue - num_dsread_per_issue_a -
num_dswrite_per_issue_a,
0); // MFMA
});
static_for<0, num_issue_b, 1>{}([&](auto i) {
ignore = i;
static_for<0, num_dsread_per_issue_b, 1>{}([&](auto idsread) {
ignore = idsread;
__builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
__builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
});
static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) {
ignore = idswrite;
__builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
__builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
});
__builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
__builtin_amdgcn_sched_group_barrier(0x008,
num_mfma_per_issue - num_dsread_per_issue_b -
num_dswrite_per_issue_b,
0); // MFMA
});
__builtin_amdgcn_sched_barrier(0);
}
template <bool HasMainLoop,
TailNumber TailNum,
typename AGridDesc,
typename ABlockDesc,
typename ABlockTransfer,
typename AGridBuffer,
typename ABlockBuffer,
typename ABlockTransferStep,
typename BGridDesc,
typename BBlockDesc,
typename BBlockTransfer,
typename BGridBuffer,
typename BBlockBuffer,
typename BBlockTransferStep,
typename CThreadBuffer,
typename BScaleGridBuffer,
typename BScaleGridDesc,
typename BScaleThreadDesc,
typename BScaleThreadTransfer,
typename BScaleThreadTransferStep>
__device__ void Run(const AGridDesc& a_grid_desc,
const ABlockDesc& a_block_desc,
ABlockTransfer& a_blockwise_copy,
const AGridBuffer& a_grid_buf,
ABlockBuffer& a_block_buf,
const ABlockTransferStep& a_block_copy_step,
const BGridDesc& b_grid_desc,
const BBlockDesc& b_block_desc,
BBlockTransfer& b_blockwise_copy,
const BGridBuffer& b_grid_buf,
BBlockBuffer& b_block_buf,
const BBlockTransferStep& b_block_copy_step,
CThreadBuffer& c_thread_buf,
// BScaleThreadCopy
const BScaleGridDesc& b_scale_grid_desc,
const BScaleThreadDesc& b_scale_thread_desc,
BScaleThreadTransfer& b_scale_thread_copy,
const BScaleGridBuffer& b_scale_grid_buf,
const BScaleThreadTransferStep& b_scale_thread_copy_step,
// num loop
index_t num_loop,
index_t num_loop_per_scale) const
{
auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
a_thread_desc_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
b_thread_desc_.GetElementSpaceSize());
// B scale buffer
auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
b_scale_thread_desc.GetElementSpaceSize());
StaticallyIndexedArray<decltype(a_thread_buf), Number<2>{}> a_thread_bufs;
StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
StaticallyIndexedArray<decltype(b_scale_thread_buf), Number<2>{}> b_scale_thread_bufs;
// Global prefetch 1
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_scale_thread_copy.Run(b_scale_grid_desc,
b_scale_grid_buf,
b_scale_thread_desc,
make_tuple(n0, I0),
b_scale_thread_bufs(I0));
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<0>{}));
});
if(num_loop_per_scale == 1)
{
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<2>{}));
}
else
{
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<1>{}));
}
// Local prefill 1
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I0));
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I0));
// Global prefetch 2
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_scale_thread_copy.Run(b_scale_grid_desc,
b_scale_grid_buf,
b_scale_thread_desc,
make_tuple(n0, I0),
b_scale_thread_bufs(I1));
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<0>{}));
});
if(2 % num_loop_per_scale == 0)
{
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<2>{}));
}
else
{
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<1>{}));
}
// Local prefetch 1
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf.At(I0),
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_bufs(I0));
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf.At(I0),
b_scale_thread_bufs(I0)[n0],
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_bufs(I0));
});
});
});
// Local prefill 2
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1));
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I1));
// Global prefetch 3
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_scale_thread_copy.Run(b_scale_grid_desc,
b_scale_grid_buf,
b_scale_thread_desc,
make_tuple(n0, I0),
b_scale_thread_bufs(I0));
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<0>{}));
});
if(3 % num_loop_per_scale == 0)
{
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<2>{}));
}
else
{
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
b_scale_thread_copy_step.At(Number<1>{}));
}
// Initialize C
c_thread_buf.Clear();
// main body
if constexpr(HasMainLoop)
{
index_t i = 0;
// This hot loop is unrolled twice to implement the double local-buffer (ping-pong) strategy
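// Sketch of the ping-pong below (buffer indices are the compile-time constants
// I0 / I1): LoopFunc(I1, I1, I0, I0) reads LDS buffer 1 into register set 1,
// writes the next prefetched tile into LDS buffer 0 and runs the MFMAs on
// register set 0; LoopFunc(I0, I0, I1, I1) is the mirror image, so each pass of
// the do-while advances the pipeline by HotloopUnroll (= 2) iterations.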
do
{
auto LoopFunc = [&](auto lds_read_buf,
auto lds_read_reg_buf,
auto lds_write_buf,
auto mfma_reg_buf) {
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf.At(lds_read_buf),
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_bufs(lds_read_reg_buf));
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf.At(lds_read_buf),
b_scale_thread_bufs(lds_read_buf)[n0],
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_bufs(lds_read_reg_buf));
});
});
// B scale copy
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_scale_thread_copy.Run(b_scale_grid_desc,
b_scale_grid_buf,
b_scale_thread_desc,
make_tuple(n0, I0),
b_scale_thread_bufs(lds_read_reg_buf));
b_scale_thread_copy.MoveSrcSliceWindow(
b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{}));
});
if((i + 4 + mfma_reg_buf.value) % num_loop_per_scale == 0)
{
b_scale_thread_copy.MoveSrcSliceWindow(
b_scale_grid_desc, b_scale_thread_copy_step.At(Number<2>{}));
}
else
{
b_scale_thread_copy.MoveSrcSliceWindow(
b_scale_grid_desc, b_scale_thread_copy_step.At(Number<1>{}));
}
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(lds_write_buf));
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(lds_write_buf));
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_bufs[mfma_reg_buf]
[Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_bufs[mfma_reg_buf]
[Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
xdlops_gemm.Run(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
});
});
});
HotLoopScheduler();
};
LoopFunc(I1, I1, I0, I0);
LoopFunc(I0, I0, I1, I1);
i += HotloopUnroll;
} while(i < (num_loop - PrefetchStages));
}
auto ReadWriteCompFunc = [&](auto lds_read_buf,
auto lds_read_reg_buf,
auto lds_write_buf,
auto mfma_reg_buf) {
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf.At(lds_read_buf),
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_bufs(lds_read_reg_buf));
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf.At(lds_read_buf),
b_scale_thread_bufs(lds_read_buf)[n0],
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_bufs(lds_read_reg_buf));
});
});
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(lds_write_buf));
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(lds_write_buf));
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_bufs[mfma_reg_buf][Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_bufs[mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
});
});
});
HotLoopScheduler();
};
auto ReadCompFunc = [&](auto lds_read_buf, auto lds_read_reg_buf, auto mfma_reg_buf) {
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf.At(lds_read_buf),
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_bufs(lds_read_reg_buf));
});
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf.At(lds_read_buf),
b_scale_thread_bufs(lds_read_buf)[n0],
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_bufs(lds_read_reg_buf));
});
});
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_bufs[mfma_reg_buf][Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_bufs[mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
});
});
});
HotLoopScheduler();
};
auto CompFunc = [&](auto mfma_reg_buf) {
static_for<0, KRepeat, 1>{}([&](auto k0) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
vector_type<ComputeDataType, KPack> a_thread_vec;
vector_type<ComputeDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<ComputeDataType>()(ik) =
a_thread_bufs[mfma_reg_buf][Number<a_thread_desc_.CalculateOffset(
make_tuple(m0, I0, k0, ik))>{}];
b_thread_vec.template AsType<ComputeDataType>()(ik) =
b_thread_bufs[mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
make_tuple(n0, I0, k0, ik))>{}];
});
using mfma_input_type =
typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
});
});
});
};
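// The three helpers above implement progressively smaller pipeline drains:
// ReadWriteCompFunc still writes the next prefetched tile to LDS before
// computing, ReadCompFunc only reads from LDS and computes, and CompFunc runs
// the final MFMAs on data already in registers; the tail below chains them
// according to TailNum.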
// tail
if constexpr(TailNum == TailNumber::Odd)
{
ReadWriteCompFunc(I1, I1, I0, I0);
ReadCompFunc(I0, I0, I1);
CompFunc(I0);
}
else if constexpr(TailNum == TailNumber::Even)
{
ReadCompFunc(I1, I1, I0);
CompFunc(I1);
}
}
protected:
using Base::a_thread_copy_;
using Base::a_thread_desc_;
using Base::b_thread_copy_;
using Base::b_thread_desc_;
using Base::c_thread_desc_;
};
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -131,7 +131,7 @@ struct ThreadGroupTensorSliceTransfer_v7r2
}
template <typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple());
using is_tuple = decltype(ck::declval<T&>().IsTuple());
template <typename DstBuffers, index_t ThreadScratchId = 0>
__device__ void RunWrite(const DstDescs& dst_descs,
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#ifndef CK_CODE_GEN_RTC
#include <string>
#endif
namespace ck {
namespace tensor_operation {
......@@ -18,6 +20,7 @@ enum struct ConvolutionForwardSpecialization
Filter3x3,
};
#ifndef CK_CODE_GEN_RTC
inline std::string getConvForwardSpecializationString(const ConvolutionForwardSpecialization& s)
{
switch(s)
......@@ -30,6 +33,7 @@ inline std::string getConvForwardSpecializationString(const ConvolutionForwardSp
default: return "Unrecognized specialization!";
}
}
#endif
} // namespace device
} // namespace tensor_operation
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#ifndef CK_CODE_GEN_RTC
#include <string>
#include <sstream>
#include <regex>
#include <optional>
#include "ck/stream_config.hpp"
#endif
namespace ck {
namespace tensor_operation {
namespace device {
#ifndef CK_CODE_GEN_RTC
#define GET_OBJECT_NAME_IMLP \
std::optional<std::string> GetObjectName() const override \
{ \
std::string str = __PRETTY_FUNCTION__; \
static std::regex obj_name_expr{"<std::string> (.*)::GetObjectName"}; \
std::smatch match; \
if(!std::regex_search(str, match, obj_name_expr)) \
{ \
return str; \
} \
return std::string(match[1]) + ';'; \
}
#define GET_TEMPLATE_INFO_IMPL \
std::optional<std::string> GetTemplateInfo() const override \
{ \
std::string str = __PRETTY_FUNCTION__; \
static std::regex template_expr{"\\[(.*)\\]"}; \
std::smatch match; \
if(!std::regex_search(str, match, template_expr)) \
{ \
return std::nullopt; \
} \
return std::string(match[1]); \
}
#define REGISTER_EXTRA_PRINTING_METHODS GET_OBJECT_NAME_IMLP GET_TEMPLATE_INFO_IMPL
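// Illustrative use (the class name is hypothetical): a concrete operator derived
// from BaseOperator can place the macro in its body,
//     struct DeviceGemmFoo : BaseOperator { REGISTER_EXTRA_PRINTING_METHODS };
// which generates GetObjectName() / GetTemplateInfo() overrides that parse
// __PRETTY_FUNCTION__ at run time.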
#endif
#ifndef CK_CODE_GEN_RTC
struct BaseArgument
{
BaseArgument() = default;
......@@ -36,18 +70,23 @@ struct BaseInvoker
virtual ~BaseInvoker() {}
};
#endif
struct BaseOperator
{
BaseOperator() = default;
BaseOperator(const BaseOperator&) = default;
BaseOperator& operator=(const BaseOperator&) = default;
#ifndef CK_CODE_GEN_RTC
virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
virtual std::string GetTypeString() const { return ""; }
virtual std::string GetTypeIdName() const { return typeid(*this).name(); }
virtual std::optional<std::string> GetObjectName() const { return std::nullopt; }
virtual std::optional<std::string> GetTemplateInfo() const { return std::nullopt; }
virtual std::string GetTypeIdHashCode() const
{
std::ostringstream oss;
......@@ -66,7 +105,7 @@ struct BaseOperator
assert(p_arg);
p_arg->p_workspace_ = p_workspace;
}
#endif
virtual ~BaseOperator() {}
};
......