merge develop

ead5167a · dummycoderfe · da1a2829 · 03c6448b · ead5167a · ead5167a
Commit ead5167a authored Nov 01, 2024 by dummycoderfe
20 changed files
--- a/include/ck_tile/core/numeric/math.hpp
+++ b/include/ck_tile/core/numeric/math.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck_tile/core/numeric/type_convert.hpp
+++ b/include/ck_tile/core/numeric/type_convert.hpp
@@ -10,6 +10,7 @@
 #include "ck_tile/core/numeric/half.hpp"
 #include "ck_tile/core/numeric/bfloat16.hpp"
 #include "ck_tile/core/numeric/float8.hpp"
+#include "ck_tile/core/numeric/int8.hpp"

 namespace ck_tile {

@@ -60,6 +61,9 @@ CK_TILE_TYPE_CONVERT(bf16_t, bf16, float, float)
 CK_TILE_TYPE_CONVERT(fp8_t, fp8, float, float)
 CK_TILE_TYPE_CONVERT(bf8_t, bf8, float, float)

+CK_TILE_TYPE_CONVERT(float, float, int8_t, int8)
+CK_TILE_TYPE_CONVERT(int8_t, int8, float, float)
+
 #undef CK_TILE_TYPE_CONVERT
 #endif


--- a/include/ck_tile/core/tensor/null_tile_window.hpp
+++ b/include/ck_tile/core/tensor/null_tile_window.hpp
@@ -80,6 +80,13 @@ CK_TILE_DEVICE constexpr auto make_tile_window(null_tensor_view,
    return null_tile_window<remove_cvref_t<WindowLengths>>{window_lengths};
 }

+template <typename WindowLengths, typename StaticTileDistribution>
+CK_TILE_DEVICE constexpr auto make_tile_window(const null_tile_window<WindowLengths>& t,
+                                               const StaticTileDistribution&) // unnamed: the distribution argument is never read
+{
+    return t; // a null tile window is handed back unchanged (returned by value via auto)
+}
+
 template <typename WindowLengths>
 CK_TILE_DEVICE void
 move_tile_window(null_tile_window<WindowLengths>&,

--- a/include/ck_tile/host/reference/reference_elementwise.hpp
+++ b/include/ck_tile/host/reference/reference_elementwise.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
+++ b/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
@@ -8,20 +8,44 @@

 namespace ck_tile {

+// Default host-reference epilogue: type-converts one row of acc into the output. Note: for simplicity, each functor call only handles a single row m.
+struct reference_layernorm2d_default_epilogue
+{
+    template <typename OutDataType, typename AccDataType>
+    void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc) // in-place form: writes row m of o
+    {
+        const int N = acc.mDesc.get_lengths()[1]; // row length, read from dimension 1 of acc
+        for(int n = 0; n < N; ++n)
+        {
+            o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
+        }
+    }
+
+    template <typename OutDataType, typename AccDataType>
+    auto operator()(int m, const HostTensor<AccDataType>& acc) // returning form: OutDataType is not deducible and must be given explicitly
+    {
+        HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides()); // same lengths/strides as acc; only row m is written below
+        operator()(m, o, acc);
+        return o;
+    }
+};
+
 template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename ComputeDataType,
          typename YDataType,
          typename MeanDataType,
-          typename InvStdDataType>
+          typename InvStdDataType,
+          typename Epilogue = reference_layernorm2d_default_epilogue>
 void reference_layernorm2d_fwd(const HostTensor<XDataType>& x_m_n,
                               const HostTensor<GammaDataType>& gamma_n,
                               const HostTensor<BetaDataType>& beta_n,
                               HostTensor<YDataType>& y_m_n,
                               HostTensor<MeanDataType>& mean_m,
                               HostTensor<InvStdDataType>& invStd_m,
-                               ComputeDataType epsilon)
+                               ComputeDataType epsilon,
+                               Epilogue epilogue_functor = {})
 {
    auto layernorm2d_fwd_func = [&](auto m) {
        const int N = x_m_n.mDesc.get_lengths()[1];
@@ -51,16 +75,19 @@ void reference_layernorm2d_fwd(const HostTensor<XDataType>& x_m_n,
        if constexpr(!std::is_same_v<InvStdDataType, ck_tile::null_type>)
            invStd_m(m) = ck_tile::type_convert<InvStdDataType>(divisor);

+        HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
        for(int n = 0; n < N; ++n)
        {
            ComputeDataType x     = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
            ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
            ComputeDataType beta  = ck_tile::type_convert<ComputeDataType>(beta_n(n));
-            auto y                = (x - mean) * divisor;
-            y                     = y * gamma + beta;
+            auto a_               = (x - mean) * divisor;
+            a_                    = a_ * gamma + beta;

-            y_m_n(m, n) = ck_tile::type_convert<YDataType>(y);
+            acc(m, n) = a_;
        }
+
+        epilogue_functor(m, y_m_n, acc);
    };

    make_ParallelTensorFunctor(layernorm2d_fwd_func,

--- a/include/ck_tile/host/reference/reference_permute.hpp
+++ b/include/ck_tile/host/reference/reference_permute.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
+++ b/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
@@ -4,9 +4,9 @@
 #pragma once

 #include "ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp"
-#include "ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp"
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp"
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp"
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp"
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
@@ -9,15 +9,16 @@
 namespace ck_tile {

 // host side args
+// X = A + B, Y = Rmsnorm2d(X), QY = RowwiseDynamicQuant(Y) = SaturateCast(Y / YScale)
 struct AddRmsnorm2dRdquantFwdHostArgs
 {
-    const void* p_a;
-    const void* p_b;
-    const void* p_gamma;
+    const void* p_a;     // [m ,n], input, fp16/bf16
+    const void* p_b;     // [m ,n], input, fp16/bf16
+    const void* p_gamma; // [1, n], gamma, prec same as input

-    void* p_x;
-    void* p_yscale;
-    void* p_qy;
+    void* p_x;      // [m, n], output, p_a + p_b, fp16/bf16
+    void* p_yscale; // [m, 1], output, rowwise quant scale (amax / 127) of result of rmsnorm2d(x)
+    void* p_qy;     // [m, n], output, result of quant tensor of rmsnorm2d(x) int8

    float epsilon;

@@ -90,7 +91,7 @@ struct AddRmsnorm2dRdquantFwd

    CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
    {
-        return integer_divide_ceil(hargs.m, Block_M);
+        return dim3(integer_divide_ceil(hargs.m, Block_M)); // single-argument dim3: x-extent is integer_divide_ceil(m, Block_M)
    }

    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
@@ -170,7 +171,7 @@ struct AddRmsnorm2dRdquantFwd
                number<1>{});

            const auto tmp2_ =
-                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadM>{});
+                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadN>{});

            return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
        }();

--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-
-namespace ck_tile {
-/*
-// clang-format off
-
-4-level descriptor: BlockTile-> WarpPerBlock-> WarpTile-> Vector
-
-                         Block_N (Warp_N * WarpPerBlock_N * Repeat_N )
-        +<----------------------< Repeat_N(2)>--------------------->+
-        |                                                           |
-        +<--    <WarpPerBlock_N(2)>  -->+
-            Warp_N
-        +--------------+--------------+--------------+--------------+----+----------------+
- Warp_M | wrap_0       | wrap_1       |                             |    ^                ^
-        +--------------+--------------+                             |   <WarpPerBlock_M(2)> |
-        | wrap_2       | wrap_3       |                             |    v
-        +--------------+--------------+--------------+--------------+----+           Block_M
-        |                             |                             |
-        +                             +                             |
-        |                             |                             |                     v
-        +--------------+--------------+--------------+--------------+                     +
-
-        each Warp-tile (e.g 16 thrd per row)
-
-         Vector_N (contiguous pixels each thrd holds along N, or vector size)
-        +-----------+-----------+-----------+-----------+-----------+
-        | thrd_0    | thrd_1    | thrd_2    | thrd_3    | ...         Vector_M
-        +-----------+-----------+-----------+-----------+-----------+
-        | thrd_16   | thrd_17   | thrd_18   | thrd_19   | ...
-        +-----------+-----------+-----------+-----------+-----------+
-// clang-format on
-*/
-template <typename BlockTile_,    // block size, seq<M, N>
-          typename WarpPerBlock_, // num warps along seq<M, N>
-          typename WarpTile_,     // warp size, seq<M, N>
-          typename Vector_,       // contiguous pixels(vector size) along seq<M, N>
-          index_t BlockSize_ =
-              warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})>
-struct AddRmsnorm2dRdquantShape
-{
-    // block size
-    static constexpr index_t Block_M = BlockTile_::at(number<0>{});
-    static constexpr index_t Block_N = BlockTile_::at(number<1>{});
-
-    // num warps along seq<M, N>, within each block
-    static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{});
-    static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{});
-
-    // warp size
-    static constexpr index_t Warp_M = WarpTile_::at(number<0>{});
-    static constexpr index_t Warp_N = WarpTile_::at(number<1>{});
-
-    static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0);
-    static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0);
-    // repeat of each thread along seq<M, N>
-    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
-    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
-
-    // vector size along seq<M, N>
-    static constexpr index_t Vector_M = Vector_::at(number<0>{});
-    static constexpr index_t Vector_N = Vector_::at(number<1>{});
-
-    static_assert(Warp_M % Vector_M == 0);
-    static_assert(Warp_N % Vector_N == 0);
-    // num of threads along seq<M, N>, within each warp
-    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
-    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
-
-    static constexpr index_t BlockSize = BlockSize_;
-};
-
-} // namespace ck_tile
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp
@@ -26,6 +26,7 @@ struct AddRmsnorm2dRdquantFwdPipelineDefaultPolicy
                sequence<1, 1, 2, 2>,
                sequence<0, 3, 0, 3>>{});
    }
+
    template <typename Problem>
    CK_TILE_DEVICE static constexpr auto MakeGammaBlockTileDistribution()
    {

--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck_tile/ops/common.hpp
+++ b/include/ck_tile/ops/common.hpp
@@ -3,4 +3,5 @@

 #pragma once

+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
--- a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

-#include "ck_tile/core.hpp"
-
 namespace ck_tile {
+
 /*
 // clang-format off

@@ -42,7 +41,7 @@ template <typename BlockTile_,    // block size, seq<M, N>
          typename Vector_,       // contiguous pixels(vector size) along seq<M, N>
          index_t BlockSize_ =
              warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})>
-struct Rmsnorm2dShape
+struct Generic2dBlockShape
 {
    // block size
    static constexpr index_t Block_M = BlockTile_::at(number<0>{});

--- a/include/ck_tile/ops/elementwise.hpp
+++ b/include/ck_tile/ops/elementwise.hpp
@@ -4,4 +4,5 @@
 #pragma once

 #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
--- a/include/ck_tile/ops/epilogue.hpp
+++ b/include/ck_tile/ops/epilogue.hpp
@@ -5,4 +5,6 @@

 #include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp"
 #include "ck_tile/ops/epilogue/default_2d_epilogue.hpp"
+#include "ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
--- a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
@@ -9,23 +9,29 @@ namespace ck_tile {

 // this epilogue just store out a M*N matrix, row major

-template <typename AccDataType_, typename ODataType_, bool kPadM_, bool kPadN_>
+template <typename AccDataType_,
+          typename ODataType_,
+          bool kPadM_,
+          bool kPadN_,
+          bool UseRawStore_ = true> // defaults to true, so existing 4-argument instantiations keep the raw-store path
 struct Default2DEpilogueProblem
 {
-    using AccDataType           = remove_cvref_t<AccDataType_>;
-    using ODataType             = remove_cvref_t<ODataType_>;
-    static constexpr bool kPadM = kPadM_;
-    static constexpr bool kPadN = kPadN_;
+    using AccDataType                 = remove_cvref_t<AccDataType_>;
+    using ODataType                   = remove_cvref_t<ODataType_>;
+    static constexpr bool kPadM       = kPadM_;
+    static constexpr bool kPadN       = kPadN_;
+    static constexpr bool UseRawStore = UseRawStore_; // Default2DEpilogue uses store_tile_raw only when this and (kPadM || kPadN) hold
 };

 template <typename Problem_, typename Policy_ = void>
 struct Default2DEpilogue
 {
-    using Problem               = remove_cvref_t<Problem_>;
-    using AccDataType           = remove_cvref_t<typename Problem::AccDataType>;
-    using ODataType             = remove_cvref_t<typename Problem::ODataType>;
-    static constexpr bool kPadM = Problem::kPadM;
-    static constexpr bool kPadN = Problem::kPadN;
+    using Problem                     = remove_cvref_t<Problem_>;
+    using AccDataType                 = remove_cvref_t<typename Problem::AccDataType>;
+    using ODataType                   = remove_cvref_t<typename Problem::ODataType>;
+    static constexpr bool kPadM       = Problem::kPadM;
+    static constexpr bool kPadN       = Problem::kPadN;
+    static constexpr bool UseRawStore = Problem::UseRawStore;

    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; }

@@ -36,7 +42,7 @@ struct Default2DEpilogue
    {

        // TODO: this is ugly
-        if constexpr(kPadM || kPadN)
+        if constexpr(UseRawStore && (kPadM || kPadN))
        {
            store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
            buffer_store_fence();

--- a/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce.hpp"
+
+namespace ck_tile {
+
+template <bool kPadM_, bool kPadN_, bool UseRawStore_ = true, bool UseMax3_ = false>
+struct DynamicQuantEpilogueTraits // compile-time switches read by DynamicQuantEpilogue through Problem::Traits
+{
+    static constexpr bool kPadM       = kPadM_; // checked together with kPadN when choosing the store path
+    static constexpr bool kPadN       = kPadN_;
+    static constexpr bool UseRawStore = UseRawStore_; // with kPadM || kPadN: store via store_tile_raw + buffer_store_fence
+    static constexpr bool UseMax3     = UseMax3_; // opt into the v_max3_f32 abs-max path (float AccDataType and even per-row size only)
+};
+
+// problem descriptor for DynamicQuantEpilogue, which stores a row-major M*N matrix after row-wise dynamic quantization
+template <typename AccDataType_,
+          typename YScaleDataType_,
+          typename ODataType_,
+          typename BlockShape_,
+          typename Traits_>
+struct DynamicQuantEpilogueProblem
+{
+    using AccDataType    = remove_cvref_t<AccDataType_>;
+    using YScaleDataType = remove_cvref_t<YScaleDataType_>;
+    using ODataType      = remove_cvref_t<ODataType_>;
+    using BlockShape     = remove_cvref_t<BlockShape_>; // can consume a generic 2d block shape
+    using Traits         = remove_cvref_t<Traits_>;
+};
+
+template <typename Problem_, typename Policy_ = void>
+struct DynamicQuantEpilogue // row-wise dynamic-quantization epilogue: stores a per-row scale and the scaled output tile
+{
+    using Problem                     = remove_cvref_t<Problem_>;
+    using AccDataType                 = remove_cvref_t<typename Problem::AccDataType>;
+    using YScaleDataType              = remove_cvref_t<typename Problem::YScaleDataType>;
+    using ODataType                   = remove_cvref_t<typename Problem::ODataType>;
+    using BlockShape                  = remove_cvref_t<typename Problem::BlockShape>;
+    static constexpr bool kPadM       = Problem::Traits::kPadM;
+    static constexpr bool kPadN       = Problem::Traits::kPadN;
+    static constexpr bool UseRawStore = Problem::Traits::UseRawStore;
+    static constexpr bool UseMax3     = Problem::Traits::UseMax3;
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d()
+    {
+        using P_ = BlockReduce2dProblem<AccDataType, AccDataType, BlockShape>; // input and compute types are both AccDataType
+        return BlockReduce2d<P_>{};
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync()
+    {
+        using P_ = BlockReduce2dProblem<AccDataType, AccDataType, BlockShape>;
+        return BlockReduce2dSync<P_>{};
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync()
+    {
+        using P_ = BlockReduce2dProblem<AccDataType, AccDataType, BlockShape>;
+        return BlockReduce2dCrossWarpSync<P_>{};
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() // shared memory requirement is exactly that of the cross-warp reduction
+    {
+        auto reduce_crosswarp_sync = GetBlockReduce2dCrossWarpSync();
+        return reduce_crosswarp_sync.GetSmemSize();
+    }
+
+    // TODO: this function assumes the store-out vector size equals the last-dimension size of OAccTile;
+    //       how do we fix this?
+    template <typename ODramWindowTmp, typename YScaleWindow, typename OAccTile>
+    CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp,
+                                   YScaleWindow& y_scale_window,
+                                   const OAccTile& o_acc_tile,
+                                   void* smem) // scratch buffer forwarded to the cross-warp reduction
+    {
+        auto reduce                = GetBlockReduce2d();
+        auto reduce_sync           = GetBlockReduce2dSync();
+        auto reduce_crosswarp_sync = GetBlockReduce2dCrossWarpSync();
+
+        const auto f_absmax = [](auto acc_, auto v_0_) { return max(acc_, abs(v_0_)); }; // fold step: running max of |v|
+
+        auto row_absmax = [&]() {
+            constexpr auto y_size_per_row =
+                OAccTile{}.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(
+                    number<1>{});
+            // constexpr auto y_size_per_row = OAccTile::get_lengths()[number<1>{}];
+            if constexpr(UseMax3 && std::is_same_v<AccDataType, float> && y_size_per_row % 2 == 0)
+            {
+                // fast max3 implementation
+                const auto f_max3 = [](auto acc_, auto v_0_, auto v_1_) {
+                    float rtn;
+                    asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                                 : "=v"(rtn)
+                                 : "v"(acc_), "v"(v_0_), "v"(v_1_));
+                    return rtn;
+                };
+                return reduce(o_acc_tile, type_convert<AccDataType>(0), f_max3, sequence<1, 2>{}); // NOTE(review): sequence<1, 2> presumably makes reduce feed two values per call — confirm
+            }
+            else
+            {
+                return reduce(o_acc_tile, type_convert<AccDataType>(0), f_absmax); // initial value 0 is valid because |v| >= 0
+            }
+        }();
+        reduce_sync(row_absmax, f_absmax); // NOTE(review): presumably merges partial maxima within a warp — confirm
+        reduce_crosswarp_sync(row_absmax, smem, f_absmax); // NOTE(review): presumably merges across warps through smem — confirm
+
+        // here y_scale is still AccDataType; it is converted to YScaleDataType when stored below
+        auto y_scale = tile_elementwise_in(
+            [&](const auto& v_) {
+                return v_ / type_convert<AccDataType>(numeric<ODataType>::max()); // scale = row abs-max / largest ODataType value
+            },
+            row_absmax);
+
+        store_tile(y_scale_window, cast_tile<YScaleDataType>(y_scale));
+
+        auto o_acc_scaled_tile =
+            make_static_distributed_tensor<AccDataType>(o_acc_tile.get_tile_distribution());
+
+        sweep_tile(o_acc_tile, [&](auto idx) {
+            constexpr auto row_id  = make_tuple(idx[number<0>{}]); // row index = first component of the tile index
+            o_acc_scaled_tile(idx) = o_acc_tile[idx] / y_scale(row_id); // NOTE(review): y_scale is 0 for an all-zero row, so this divides by zero — confirm intended
+        });
+
+        // TODO: this is ugly
+        if constexpr(UseRawStore && (kPadM || kPadN))
+        {
+            store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_scaled_tile));
+            buffer_store_fence();
+        }
+        else
+        {
+            store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_scaled_tile));
+        }
+    }
+};
+} // namespace ck_tile
--- a/include/ck_tile/ops/fmha.hpp
+++ b/include/ck_tile/ops/fmha.hpp
@@ -43,4 +43,5 @@
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp"
 #include "ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp"
 #include "ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
--- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once