Commit 7c56cd01 authored by Astha Rai

fixing merge errors: unary_element_wise_operation.hpp

parents 7d3ee266 cb6c5d39
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp"
#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp"
#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp"
#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp"
#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
namespace ck_tile {
// host side args
struct SmoothquantHostArgs
{
const void* p_x; // [m, n], input, fp16/bf16
const void* p_xscale; // [1, n], input, columnwise scale, fp32
void* p_yscale; // [m, 1], output, rowwise quant scale (amax / 127) of (p_x * p_xscale)
void* p_qy; // [m, n], output, p_x * p_xscale / p_yscale
index_t m;
index_t n;
index_t stride; // row_stride
};
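// For illustration only: a minimal host-side sketch (hypothetical helper, not part of the
// library) of how these arguments might be filled for a contiguous, row-major [m, n] input.
#if 0
SmoothquantHostArgs make_example_args(const void* x_dev,      // fp16/bf16, m*n elements
                                      const void* xscale_dev, // fp32, n elements
                                      void* yscale_dev,       // fp32, m elements
                                      void* qy_dev,           // int8, m*n elements
                                      index_t m,
                                      index_t n)
{
    SmoothquantHostArgs args{};
    args.p_x      = x_dev;
    args.p_xscale = xscale_dev;
    args.p_yscale = yscale_dev;
    args.p_qy     = qy_dev;
    args.m        = m;
    args.n        = n;
    args.stride   = n; // contiguous rows: row stride equals n
    return args;
}
#endif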
// TODO: Extract some type to wrapper class
template <typename Pipeline_>
struct Smoothquant
{
using Pipeline = remove_cvref_t<Pipeline_>;
using Problem = typename Pipeline::Problem;
using XDataType = remove_cvref_t<typename Problem::XDataType>;
using XScaleDataType = remove_cvref_t<typename Problem::XScaleDataType>;
using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
using YScaleDataType = remove_cvref_t<typename Problem::YScaleDataType>;
using QYDataType = remove_cvref_t<typename Problem::QYDataType>;
static constexpr index_t Block_M = Problem::BlockShape::Block_M;
static constexpr index_t Block_N = Problem::BlockShape::Block_N;
static constexpr bool kPadM = false; // always no need to pad along M
static constexpr bool kPadN = Problem::kPadN;
static constexpr bool kTwoPass = Problem::kTwoPass;
static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
static constexpr index_t Vector_N = Problem::BlockShape::Vector_N;
static constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N;
static constexpr auto I0 = number<0>{};
static constexpr auto I1 = number<1>{};
struct Kargs
{
const void* p_x;
const void* p_xscale;
void* p_yscale;
void* p_qy;
index_t m;
index_t n;
index_t stride; // row_stride
};
using Hargs = SmoothquantHostArgs;
CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
{
return Kargs{
hargs.p_x, hargs.p_xscale, hargs.p_yscale, hargs.p_qy, hargs.m, hargs.n, hargs.stride};
}
CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
{
return dim3(integer_divide_ceil(hargs.m, Block_M));
}
CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
// clang-format off
template <typename T> struct t2s;
template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
// clang-format on
// in bytes
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); }
CK_TILE_HOST static std::string GetName()
{
// clang-format off
using S_ = typename Problem::BlockShape;
auto suffix = [&] () {
std::string n;
if (kPadN) n += "_pn";
if (kTwoPass) n += "_2p";
return n; }();
#define _SS_ std::string
#define _TS_ std::to_string
return _SS_("smoothquant_fwd_") + _SS_(t2s<XDataType>::name) + "_" +
_TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" +
_TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" +
_SS_(Pipeline::name) + suffix;
#undef _SS_
#undef _TS_
// clang-format on
}
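// For illustration only (hypothetical shape): with XDataType = fp16, Block 1x64,
// WarpPerBlock 1x1, Warp 1x64, Vector 1x8 and the one-pass pipeline without padding,
// GetName() would produce "smoothquant_fwd_fp16_1x64_1x1_1x64_1x8_wpr_op";
// "_pn" / "_2p" are appended when kPadN / kTwoPass are set.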
CK_TILE_DEVICE void operator()(Kargs kargs) const
{
const auto iM = get_block_id() * Block_M;
const auto x_window = [&]() {
const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
static_cast<const XDataType*>(kargs.p_x),
make_tuple(kargs.m, kargs.n),
make_tuple(kargs.stride, 1),
number<Vector_N>{},
number<1>{});
const auto tmp2_ = pad_tensor_view(
tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
return make_tile_window(
tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
}();
const auto xscale_window = [&]() {
const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
static_cast<const XScaleDataType*>(kargs.p_xscale),
make_tuple(kargs.n),
make_tuple(1),
number<Vector_N>{},
number<1>{});
const auto tmp2_ =
pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadN>{});
return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
}();
auto yscale_window = [&]() {
const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
static_cast<YScaleDataType*>(kargs.p_yscale),
make_tuple(kargs.m),
make_tuple(1),
number<1>{});
const auto tmp2_ =
pad_tensor_view(tmp_, make_tuple(number<Block_M>{}), sequence<kPadM>{});
return make_tile_window(tmp2_, make_tuple(number<Block_M>{}), {iM});
}();
auto qy_window = [&]() {
auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
static_cast<QYDataType*>(kargs.p_qy),
make_tuple(kargs.m, kargs.n),
make_tuple(kargs.stride, 1),
number<Vector_N>{},
number<1>{});
auto tmp2_ = pad_tensor_view(
tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
return make_tile_window(
tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
}();
__shared__ char smem[GetSmemSize()];
Pipeline{}(x_window, xscale_window, yscale_window, qy_window, kargs.n, smem);
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
namespace ck_tile {
struct SmoothquantPipelineDefaultPolicy
{
template <typename Problem>
CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
{
using S = typename Problem::BlockShape;
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<>,
tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
tuple<sequence<1, 2>, sequence<1, 2>>,
tuple<sequence<1, 1>, sequence<2, 2>>,
sequence<1, 1, 2, 2>,
sequence<0, 3, 0, 3>>{});
}
template <typename Problem>
CK_TILE_DEVICE static constexpr auto MakeXScaleBlockTileDistribution()
{
using S = typename Problem::BlockShape;
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<S::WarpPerBlock_M, S::ThreadPerWarp_M>,
tuple<sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
tuple<sequence<0, 1>, sequence<0, 1>>,
tuple<sequence<0, 1>, sequence<1, 2>>,
sequence<1, 1>,
sequence<0, 3>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d()
{
using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
typename Problem::ComputeDataType,
typename Problem::BlockShape>;
return BlockReduce2d<P_>{};
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync()
{
using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
typename Problem::ComputeDataType,
typename Problem::BlockShape>;
return BlockReduce2dSync<P_>{};
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync()
{
using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
typename Problem::ComputeDataType,
typename Problem::BlockShape>;
return BlockReduce2dCrossWarpSync<P_>{};
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{
if constexpr(Problem::kNeedCrossWarpSync)
{
using P_ = BlockReduce2dProblem<typename Problem::XDataType,
typename Problem::ComputeDataType,
typename Problem::BlockShape>;
using block_reduce2d = BlockReduce2d<P_>;
using x_block_tile =
decltype(make_static_distributed_tensor<typename Problem::XDataType>(
MakeXBlockTileDistribution<Problem>()));
using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile<x_block_tile>());
return GetBlockReduce2dCrossWarpSync<Problem>().template GetSmemSize<y_block_tile>();
}
else
{
return 1; // zero size arrays are an extension
}
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
template <typename Problem_, typename Policy_ = SmoothquantPipelineDefaultPolicy>
struct SmoothquantPipelineOnePass
{
using Problem = ck_tile::remove_cvref_t<Problem_>;
using Policy = ck_tile::remove_cvref_t<Policy_>;
using XDataType = ck_tile::remove_cvref_t<typename Problem::XDataType>;
using XScaleDataType = ck_tile::remove_cvref_t<typename Problem::XScaleDataType>;
using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
using QYDataType = ck_tile::remove_cvref_t<typename Problem::QYDataType>;
using YScaleDataType = ck_tile::remove_cvref_t<typename Problem::YScaleDataType>;
static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM
static constexpr bool kPadN = Problem::kPadN;
static constexpr const char* name = []() {
if constexpr(kNeedCrossWarpSync)
return "bpr_op"; // block per row
else
return "wpr_op"; // warp per row
}();
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{
return Policy::template GetSmemSize<Problem>();
}
template <typename XWindow, typename XScaleWindow, typename QYWindow, typename YScaleWindow>
CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
const XScaleWindow& xscale_window_,
YScaleWindow& yscale_window,
QYWindow& qy_window,
ck_tile::index_t,
void* smem) const
{
auto x_window =
make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
auto xscale_window = make_tile_window(
xscale_window_, Policy::template MakeXScaleBlockTileDistribution<Problem>());
auto reduce_absmax_func = ReduceOp::AbsMax{};
auto reduce_max_func = ReduceOp::Max{};
auto block_reduce2d = Policy::template GetBlockReduce2d<Problem>();
auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
auto block_reduce2d_cross_warp_sync =
Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
const auto x = load_tile(x_window);
const auto xscale = load_tile(xscale_window);
auto y = tile_elementwise_in(
[&](const auto& a, const auto& b) {
return type_convert<ComputeDataType>(a) * type_convert<ComputeDataType>(b);
},
x,
xscale);
// compute absmax, cross-lane->cross-warp
auto absmax = block_reduce2d(
y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
block_reduce2d_sync(absmax, reduce_max_func);
block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);
// ex: yscale = absmax / 127 if int8
auto yscale = tile_elementwise_in(
[&](const auto& v_) {
return v_ / type_convert<ComputeDataType>(numeric<QYDataType>::max());
},
absmax);
store_tile(yscale_window, cast_tile<YScaleDataType>(yscale));
// quantize y to qy
auto qy = make_static_distributed_tensor<QYDataType>(y.get_tile_distribution());
sweep_tile(qy, [&](auto idx) {
constexpr auto i_idx = make_tuple(idx[number<0>{}]);
auto qy_ = y[idx] / yscale[i_idx];
qy(idx) = saturates<QYDataType>{}(qy_);
});
store_tile(qy_window, qy);
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/utility/type_traits.hpp"
namespace ck_tile {
// Y = X * XScale, QY = RowwiseDynamicQuant(Y) = SaturateCast(Y / YScale)
template <typename XDataType_,
typename XScaleDataType_,
typename ComputeDataType_,
typename YScaleDataType_,
typename QYDataType_,
typename BlockShape_,
bool kPadN_,
bool kTwoPass_>
struct SmoothquantPipelineProblem
{
using XDataType = remove_cvref_t<XDataType_>;
using XScaleDataType = remove_cvref_t<XScaleDataType_>;
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
using YScaleDataType = remove_cvref_t<YScaleDataType_>;
using QYDataType = remove_cvref_t<QYDataType_>;
using BlockShape = remove_cvref_t<BlockShape_>;
static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
static constexpr bool kPadN = kPadN_;
static constexpr bool kTwoPass = kTwoPass_;
};
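// For illustration only: a scalar reference of the math described above (hypothetical helper,
// not part of the library). It mirrors the device pipelines: y = x * xscale per column,
// yscale = rowwise amax(y) / 127 for int8, qy = saturate(y / yscale); all-zero rows are not
// handled here.
#if 0
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>
inline void smoothquant_reference(const std::vector<float>& x,      // [m, n]
                                  const std::vector<float>& xscale, // [n]
                                  std::vector<float>& yscale,       // [m]
                                  std::vector<int8_t>& qy,          // [m, n]
                                  int m,
                                  int n)
{
    for(int i = 0; i < m; ++i)
    {
        float absmax = 0.f;
        for(int j = 0; j < n; ++j)
            absmax = std::max(absmax, std::fabs(x[i * n + j] * xscale[j]));
        yscale[i] = absmax / 127.f;
        for(int j = 0; j < n; ++j)
        {
            float q       = x[i * n + j] * xscale[j] / yscale[i];
            q             = std::min(127.f, std::max(-127.f, q)); // saturate to int8 range
            qy[i * n + j] = static_cast<int8_t>(std::lround(q));
        }
    }
}
#endif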
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
template <typename Problem_, typename Policy_ = SmoothquantPipelineDefaultPolicy>
struct SmoothquantPipelineTwoPass
{
using Problem = ck_tile::remove_cvref_t<Problem_>;
using Policy = ck_tile::remove_cvref_t<Policy_>;
using XDataType = ck_tile::remove_cvref_t<typename Problem::XDataType>;
using XScaleDataType = ck_tile::remove_cvref_t<typename Problem::XScaleDataType>;
using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
using QYDataType = ck_tile::remove_cvref_t<typename Problem::QYDataType>;
using YScaleDataType = ck_tile::remove_cvref_t<typename Problem::YScaleDataType>;
static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM
static constexpr bool kPadN = Problem::kPadN;
static constexpr const char* name = []() {
if constexpr(kNeedCrossWarpSync)
return "bpr_tp"; // block per row
else
return "wpr_tp"; // warp per row
}();
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{
return Policy::template GetSmemSize<Problem>();
}
template <typename XWindow, typename XScaleWindow, typename QYWindow, typename YScaleWindow>
CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
const XScaleWindow& xscale_window_,
YScaleWindow& yscale_window,
QYWindow& qy_window,
ck_tile::index_t row_size,
void* smem) const
{
auto x_window =
make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
auto xscale_window = make_tile_window(
xscale_window_, Policy::template MakeXScaleBlockTileDistribution<Problem>());
static constexpr index_t Block_N = Problem::BlockShape::Block_N;
index_t num_n_tile_iteration =
__builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
auto reduce_absmax_func = ReduceOp::AbsMax{};
auto reduce_max_func = ReduceOp::Max{};
auto block_reduce2d = Policy::template GetBlockReduce2d<Problem>();
auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
auto block_reduce2d_cross_warp_sync =
Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
using XTensorType = decltype(cast_tile<ComputeDataType>(load_tile(x_window)));
auto absmax = block_reduce2d.template MakeYBlockTile<XTensorType>();
set_tile(absmax, reduce_absmax_func.GetIdentityValue<ComputeDataType>());
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
const auto x = load_tile(x_window);
const auto xscale = load_tile(xscale_window);
const auto y = tile_elementwise_in(
[&](const auto& a, const auto& b) {
return type_convert<ComputeDataType>(a) * type_convert<ComputeDataType>(b);
},
x,
xscale);
block_reduce2d(y, absmax, reduce_absmax_func);
move_tile_window(x_window, {0, Block_N});
move_tile_window(xscale_window, {Block_N});
}
// compute absmax, cross-lane->cross-warp
block_reduce2d_sync(absmax, reduce_max_func);
block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);
// ex: yscale = absmax / 127 if int8
auto yscale = tile_elementwise_in(
[&](const auto& v_) {
return v_ / type_convert<ComputeDataType>(numeric<QYDataType>::max());
},
absmax);
store_tile(yscale_window, cast_tile<YScaleDataType>(yscale));
// reverse read x to reuse cache
ck_tile::index_t stride_to_right_most_window =
row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;
move_tile_window(x_window, {0, -Block_N});
move_tile_window(xscale_window, {-Block_N});
move_tile_window(qy_window, {0, stride_to_right_most_window});
// recompute y and quantize y to qy
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
const auto x = load_tile(x_window);
const auto xscale = load_tile(xscale_window);
const auto y = tile_elementwise_in(
[&](const auto& a, const auto& b) {
return type_convert<ComputeDataType>(a) * type_convert<ComputeDataType>(b);
},
x,
xscale);
auto qy = make_static_distributed_tensor<QYDataType>(y.get_tile_distribution());
sweep_tile(qy, [&](auto idx) {
constexpr auto i_idx = make_tuple(idx[number<0>{}]);
auto qy_ = y[idx] / yscale[i_idx];
qy(idx) = saturates<QYDataType>{}(qy_);
});
store_tile(qy_window, qy);
move_tile_window(x_window, {0, -Block_N});
move_tile_window(xscale_window, {-Block_N});
move_tile_window(qy_window, {0, -Block_N});
}
}
};
} // namespace ck_tile
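// Worked example (hypothetical sizes) for the reverse traversal in the two-pass pipeline above:
// with row_size = 1000 and Block_N = 256, num_n_tile_iteration = 4 and
// stride_to_right_most_window = 1000 - 1000 % 256 = 768, so after the absmax pass the x and
// xscale windows step back by 256 per iteration while the qy window starts at column 768 and
// also steps back by 256, revisiting the most recently cached tiles first.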
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/softmax/block/block_softmax_2d.hpp"
#include "ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/reduce.hpp"
#define _BLOCK_SOFTMAX_USE_UNPACK2 0
namespace ck_tile {
/*
simple 2d softmax implementation, along row (dim=1)
requirement:
1). each row is within a warp
2). data type must be a dword
*/
template <typename Problem_, typename Policy_ = void>
struct BlockSoftmax2D
{
using Problem = remove_cvref_t<Problem_>;
using Policy = remove_cvref_t<Policy_>;
using DataType = typename Problem::DataType;
template <typename DistributedTensor, index_t dim = 1>
CK_TILE_DEVICE void
operator()(const DistributedTensor& x, DistributedTensor& y, number<dim> = {})
{
const auto f_max = [](auto e0, auto e1) { return max(e0, e1); };
const auto f_sum = [](auto e0, auto e1) { return e0 + e1; };
#if _BLOCK_SOFTMAX_USE_UNPACK2
const auto f_max3 = [](auto e0, auto e1, auto e2) {
float rtn;
asm volatile("v_max3_f32 %0, %1, %2, %3" : "=v"(rtn) : "v"(e0), "v"(e1), "v"(e2));
return rtn;
};
const auto f_sum3 = [](auto e0, auto e1, auto e2) { return e0 + e1 + e2; };
#endif
// compute row max
auto reduce_row_max = BlockReduce2D{x, -numeric<DataType>::infinity()};
#if _BLOCK_SOFTMAX_USE_UNPACK2
auto row_max = reduce_row_max(f_max3, f_max, sequence<1, 2>{});
#else
auto row_max = reduce_row_max(f_max);
#endif
sweep_tile<DistributedTensor>([&](auto idx) {
constexpr auto row_id = make_tuple(idx[number<0>{}]);
y(idx) = exp(x[idx] - row_max[row_id]);
});
// compute row sum
auto reduce_row_sum = BlockReduce2D<decltype(y)>{y, DataType{0}};
#if _BLOCK_SOFTMAX_USE_UNPACK2
auto row_sum = reduce_row_sum(f_sum3, f_sum, sequence<1, 2>{});
#else
auto row_sum = reduce_row_sum(f_sum);
#endif
// reciprocal
auto r = make_static_distributed_tensor<DataType>(row_sum.get_tile_distribution());
sweep_tile(row_sum, [&](auto idx) { r(idx) = DataType{1} / row_sum(idx); });
// scale
sweep_tile<DistributedTensor>([&](auto idx) {
constexpr auto row_id = make_tuple(idx[number<0>{}]);
y(idx) = y(idx) * r(row_id);
});
}
template <typename DistributedTensor, index_t dim = 1>
CK_TILE_DEVICE decltype(auto) operator()(const DistributedTensor& x, number<dim> = {})
{
auto y = DistributedTensor{}; // distributed tensor
operator()(x, y, number<dim>{});
return y;
}
};
} // namespace ck_tile
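// For illustration only: a scalar sketch of the per-row softmax performed above
// (max-subtraction for numerical stability, exp, row sum, then scale by the reciprocal);
// hypothetical helper, not part of the library.
#if 0
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>
inline void softmax_row_reference(std::vector<float>& row)
{
    float row_max = -std::numeric_limits<float>::infinity();
    for(float v : row)
        row_max = std::max(row_max, v);
    float row_sum = 0.f;
    for(float& v : row)
    {
        v = std::exp(v - row_max);
        row_sum += v;
    }
    const float r = 1.f / row_sum; // reciprocal, as in BlockSoftmax2D
    for(float& v : row)
        v *= r;
}
#endif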
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <typename DataType_>
struct BlockSoftmax2DProblem
{
using DataType = remove_cvref_t<DataType_>;
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/topk/block/block_topk_stream_2d.hpp"
#include "ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
/*
simple 2d topk implementation, along row (dim=1)
requirement:
1). each row is within a warp
*/
template <typename Problem_, typename Policy_ = void>
struct BlockTopkStream2D
{
using Problem = remove_cvref_t<Problem_>;
using Policy = remove_cvref_t<Policy_>;
using DataType = typename Problem::DataType;
using IndexType = typename Problem::IndexType;
// TODO: if DataType is sub-dword, it needs to be packed into a single dword to use argmax
struct ArgmaxPacket
{
DataType arg;
index_t value;
};
template <typename DistributedTensor, typename OutWindow, typename IdxWindow, index_t dim = 1>
CK_TILE_DEVICE void operator()(const DistributedTensor& x,
const OutWindow& out_window,
const IdxWindow& idx_window,
index_t k,
number<dim> = {})
{
OutWindow out_window_tmp = out_window;
IdxWindow idx_window_tmp = idx_window;
static_assert(
std::is_same_v<typename DistributedTensor::DataType, typename OutWindow::DataType> &&
std::is_same_v<typename DistributedTensor::DataType, DataType>);
static_assert(std::is_same_v<typename IdxWindow::DataType, IndexType>);
DistributedTensor x_tmp = x;
constexpr auto dst_dist = typename IdxWindow::TileDstr{};
// argmax for topk
const auto f_argmax = [](ArgmaxPacket e0, ArgmaxPacket e1) {
return e0.arg > e1.arg ? e0 : e1;
};
for(index_t i_k = 0; i_k < k; i_k++)
{
constexpr auto span_2d = DistributedTensor::get_distributed_spans();
auto packet = [&]() {
auto tmp = make_static_distributed_tensor<ArgmaxPacket>(x.get_tile_distribution());
sweep_tile_span(span_2d[number<0>{}], [&](auto idx0) {
sweep_tile_span(span_2d[number<1>{}], [&](auto idx1) {
const auto tile_idx = get_x_indices_from_distributed_indices(
tmp.get_tile_distribution(), make_tuple(idx0, idx1));
constexpr auto i_j_idx = make_tuple(idx0, idx1);
ArgmaxPacket t;
t.arg = x_tmp(i_j_idx); // !!! we reference x here
t.value = tile_idx.at(number<1>{});
tmp(i_j_idx) = t;
});
});
return tmp;
}();
auto argmax_init = ArgmaxPacket{-numeric<DataType>::infinity(), 0};
auto r = block_tile_reduce<ArgmaxPacket>(packet, sequence<1>{}, f_argmax, argmax_init);
block_tile_reduce_xor_sync(r, f_argmax);
auto o = make_static_distributed_tensor<DataType>(dst_dist);
auto i = make_static_distributed_tensor<IndexType>(dst_dist);
sweep_tile_span(span_2d[number<0>{}], [&](auto idx0) {
sweep_tile_span(span_2d[number<1>{}], [&](auto idx1) {
constexpr auto i_j_idx = make_tuple(idx0, idx1);
ArgmaxPacket tmp = r(i_j_idx);
o(i_j_idx) = tmp.arg;
i(i_j_idx) = tmp.value;
});
});
// update value
sweep_tile_span(span_2d[number<0>{}], [&](auto idx0) {
sweep_tile_span(span_2d[number<1>{}], [&](auto idx1) {
const auto tile_idx = get_x_indices_from_distributed_indices(
x.get_tile_distribution(), make_tuple(idx0, idx1));
auto col_id = tile_idx.at(number<1>{});
constexpr auto i_j_idx = make_tuple(idx0, idx1);
x_tmp(i_j_idx) = (col_id == r(i_j_idx).value) ? -numeric<DataType>::infinity()
: x_tmp(i_j_idx);
});
});
if(threadIdx.x % Problem::ColLanes == 0)
{
store_tile(out_window_tmp, o);
store_tile(idx_window_tmp, i);
}
move_tile_window(out_window_tmp, {number<0>{}, number<1>{}});
move_tile_window(idx_window_tmp, {number<0>{}, number<1>{}});
}
}
};
} // namespace ck_tile
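// For illustration only: a scalar sketch of the streaming top-k used above (hypothetical
// helper, not part of the library). It runs k rounds of argmax, masking each winner to -inf,
// which emits value/index pairs in descending order just like BlockTopkStream2D.
#if 0
#include <limits>
#include <utility>
#include <vector>
inline std::vector<std::pair<float, int>> topk_stream_reference(std::vector<float> row, int k)
{
    std::vector<std::pair<float, int>> out;
    for(int i_k = 0; i_k < k; ++i_k)
    {
        int best = 0;
        for(int j = 1; j < static_cast<int>(row.size()); ++j)
        {
            if(row[j] > row[best])
                best = j;
        }
        out.emplace_back(row[best], best);
        row[best] = -std::numeric_limits<float>::infinity(); // mask out the winner
    }
    return out;
}
#endif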
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
/*
simple 2d topk implementation, along row (dim=1)
requirement:
1). each row is within a warp
*/
template <typename DataType_, typename IndexType_, index_t ColLanes_>
struct BlockTopkStream2DProblem
{
using DataType = remove_cvref_t<DataType_>;
using IndexType = remove_cvref_t<IndexType_>;
static constexpr index_t ColLanes = ColLanes_;
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp"
#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp"
#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp"
#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
struct TopkSoftmaxHostArgs
{
const void* p_input;
void* p_output;
void* p_indices;
index_t num_rows;
index_t num_experts;
index_t topk;
index_t stride_input; // row stride for input, at least experts
index_t stride_output; // row stride for output/indices, at least topk
};
template <typename Pipeline_>
struct TopkSoftmaxKernel
{
using Pipeline = remove_cvref_t<Pipeline_>;
using Problem = remove_cvref_t<typename Pipeline::Problem>;
using InputType = typename Problem::InputType;
using WeightType = typename Problem::WeightType;
using IndexType = typename Problem::IndexType;
struct TopkSoftmaxKargs
{
const void* p_input;
void* p_output;
void* p_indices;
index_t num_rows;
index_t num_experts;
index_t topk;
index_t stride_input; // row stride for input, at least experts
index_t stride_output; // row stride for output/indices, at least topk
};
using Kargs = TopkSoftmaxKargs;
using Hargs = TopkSoftmaxHostArgs;
CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
{
if constexpr(Problem::LaunchType > 0)
{
int num_cu = [&]() {
hipDeviceProp_t dev_prop;
hipDevice_t dev;
HIP_CHECK_ERROR(hipGetDevice(&dev));
HIP_CHECK_ERROR(hipGetDeviceProperties(&dev_prop, dev));
return dev_prop.multiProcessorCount;
}();
return dim3(num_cu * Problem::LaunchType);
}
else
{
const int num_warps = (h.num_rows + Problem::RowsPerWarp - 1) / Problem::RowsPerWarp;
const int num_blocks =
(num_warps + Problem::WarpsPerBlock - 1) / Problem::WarpsPerBlock;
return dim3(num_blocks);
}
}
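// Worked example (hypothetical sizes): in streaming mode (LaunchType == 0) with
// num_rows = 1000, RowsPerWarp = 16 and WarpsPerBlock = 4, this gives
// num_warps = ceil(1000 / 16) = 63 and a grid of ceil(63 / 4) = 16 blocks; in persistent
// mode the grid is num_cu * LaunchType blocks regardless of num_rows.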
CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
{
Kargs k;
k.p_input = h.p_input;
k.p_output = h.p_output;
k.p_indices = h.p_indices;
k.num_rows = h.num_rows;
k.num_experts = h.num_experts;
k.topk = h.topk;
k.stride_input = h.stride_input;
k.stride_output = h.stride_output;
return k;
}
CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; }
CK_TILE_DEVICE void operator()(Kargs kargs) const
{
index_t block_row_id = static_cast<index_t>(blockIdx.x * Problem::RowsPerBlock);
if(block_row_id >= kargs.num_rows)
return;
index_t block_os_inp = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_input);
index_t block_os_out = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_output);
index_t num_rows_rem = __builtin_amdgcn_readfirstlane(kargs.num_rows - block_row_id);
const auto input_window = [&]() {
const InputType* p_input =
reinterpret_cast<const InputType*>(kargs.p_input) + block_os_inp;
auto tmp = make_naive_tensor_view<address_space_enum::global>(
p_input,
make_tuple(num_rows_rem, kargs.num_experts),
make_tuple(kargs.stride_input, 1),
number<Problem::VectorSize>{},
number<1>{});
auto view = pad_tensor_view(
tmp,
make_tuple(number<Problem::RowsPerBlock>{}, number<Problem::Experts>{}),
sequence<0, 1>{}); // outer-most dim needs no padding (leverage OOB handling)
return make_tile_window(
view,
make_tuple(number<Problem::RowsPerBlock>{}, number<Problem::Experts>{}),
{0, 0});
}();
auto output_window = [&]() {
WeightType* p_output = reinterpret_cast<WeightType*>(kargs.p_output) + block_os_out;
auto tmp = make_naive_tensor_view<address_space_enum::global>(
p_output,
make_tuple(num_rows_rem, kargs.topk),
make_tuple(kargs.stride_output, 1),
number<Problem::VectorSize>{},
number<1>{});
auto view =
pad_tensor_view(tmp,
make_tuple(number<Problem::RowsPerBlock>{}, number<1>{}),
sequence<0, 0>{}); // 1. outer-most dim needs no padding (leverage OOB handling)
// 2. we loop over topk one element at a time, no padding needed
return make_tile_window(
view, make_tuple(number<Problem::RowsPerBlock>{}, number<1>{}), {0, 0});
}();
auto indices_window = [&]() {
IndexType* p_indices = reinterpret_cast<IndexType*>(kargs.p_indices) + block_os_out;
auto tmp = make_naive_tensor_view<address_space_enum::global>(
p_indices,
make_tuple(num_rows_rem, kargs.topk),
make_tuple(kargs.stride_output, 1),
number<Problem::VectorSize>{},
number<1>{});
auto view =
pad_tensor_view(tmp,
make_tuple(number<Problem::RowsPerBlock>{}, number<1>{}),
sequence<0, 0>{}); // 1. outer-most dim needs no padding (leverage OOB handling)
// 2. we loop over topk one element at a time, no padding needed
return make_tile_window(
view, make_tuple(number<Problem::RowsPerBlock>{}, number<1>{}), {0, 0});
}();
Pipeline{}(input_window,
output_window,
indices_window,
kargs.num_rows,
kargs.num_experts,
kargs.topk,
block_row_id);
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp"
#include <string>
#include <type_traits>
#ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 0
#endif
namespace ck_tile {
template <typename Problem_, typename Policy_ = TopkSoftmaxWarpPerRowPolicy>
struct TopkSoftmaxWarpPerRowPipeline
{
// TODO: this kernel only supports warp per row
using Problem = remove_cvref_t<Problem_>;
using Policy = remove_cvref_t<Policy_>;
using WeightType = typename Problem::WeightType;
template <typename InputWindow, typename OutputWindow, typename IndexWindow>
CK_TILE_DEVICE auto operator()(const InputWindow& input_window,
OutputWindow& out_window,
IndexWindow& idx_window,
index_t rows,
index_t experts,
index_t k,
index_t block_row_id)
{
#if TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
auto inp_win = make_tile_window_linear_raw(
input_window, Policy::template MakeInputDistribution<Problem>(), sequence<0, 1>{});
#else
auto inp_win = make_tile_window_linear(
input_window, Policy::template MakeInputDistribution<Problem>(), sequence<0, 1>{});
#endif
auto out_win = make_tile_window_linear(out_window.get_bottom_tensor_view(),
out_window.get_window_lengths(),
out_window.get_window_origin(),
Policy::template MakeOutputDistribution<Problem>());
auto idx_win = make_tile_window_linear(idx_window.get_bottom_tensor_view(),
idx_window.get_window_lengths(),
idx_window.get_window_origin(),
Policy::template MakeOutputDistribution<Problem>());
auto softmax = Policy::template GetSoftmax<Problem>();
auto topk = Policy::template GetTopk<Problem>();
const index_t grid_rows_per_loop = gridDim.x * Problem::RowsPerBlock;
while(1)
{
#if TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
__builtin_amdgcn_sched_barrier(0);
auto x =
load_tile_raw(inp_win, number<-1>{}, bool_constant<true>{}, bool_constant<true>{});
buffer_load_fence(number<0>{});
__builtin_amdgcn_sched_barrier(0);
#else
auto x = load_tile(inp_win);
#endif
// cast and pad input data
auto w = [&]() {
#if 0
auto w_ = cast_tile<WeightType>(x);
constexpr auto span_2d = decltype(w_)::get_distributed_spans();
sweep_tile_span(span_2d[number<0>{}], [&](auto idx0) {
sweep_tile_span(span_2d[number<1>{}], [&](auto idx1) {
constexpr auto i_j_idx = make_tuple(idx0, idx1);
const auto x_indices = get_x_indices_from_distributed_indices(
w_.get_tile_distribution(), i_j_idx);
const auto current_expert = x_indices.at(number<1>{});
// set to -INF if OOB so that later softmax can work properly
w_(i_j_idx) = current_expert >= experts ? -numeric<WeightType>::infinity()
: w_(i_j_idx);
});
});
return w_;
#else
auto w_ = make_static_distributed_tensor<WeightType>(x.get_tile_distribution());
auto w_f = [&](auto idx) {
w_(idx) = type_convert<WeightType>(x(idx));
const auto x_indices =
get_x_indices_from_distributed_indices(w_.get_tile_distribution(), idx);
const auto current_expert = x_indices.at(number<1>{});
w_(idx) =
current_expert >= experts ? -numeric<WeightType>::infinity() : w_(idx);
};
tile_sweeper ts{w_, w_f};
ts();
return w_;
#endif
}();
// softmax
auto y = softmax(w);
topk(y, out_win, idx_win, k);
// check exit
if constexpr(Problem::LaunchType == 0)
{
break;
}
else
{
block_row_id += grid_rows_per_loop;
if(block_row_id >= rows)
break;
}
move_tile_window(inp_win, {grid_rows_per_loop, number<0>{}});
move_tile_window(out_win, {grid_rows_per_loop, number<0>{}});
move_tile_window(idx_win, {grid_rows_per_loop, number<0>{}});
}
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/softmax.hpp"
#include "ck_tile/ops/topk.hpp"
namespace ck_tile {
struct TopkSoftmaxWarpPerRowPolicy
{
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeInputDistribution()
{
// TODO: Y dim must have one dim that is not reduced
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<1>,
tuple<sequence<Problem::IssuesPerCol,
Problem::WarpsPerBlock,
Problem::RowsPerWarpPerColIssue>,
sequence<Problem::IssuesPerRow, Problem::LanesPerRow, Problem::VectorSize>>,
tuple<sequence<1>, sequence<1, 2>>,
tuple<sequence<1>, sequence<2, 1>>,
sequence<1, 2, 2>,
sequence<0, 0, 2>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeOutputDistribution()
{
return make_static_tile_distribution(
tile_distribution_encoding<sequence<Problem::LanesPerRow>, // repeat this one
tuple<sequence<Problem::IssuesPerCol,
Problem::WarpsPerBlock,
Problem::RowsPerWarpPerColIssue>,
sequence<1>>, // each row write out single element
tuple<sequence<1>, sequence<1, 0>>,
tuple<sequence<1>, sequence<2, 0>>,
sequence<1, 2>,
sequence<0, 0>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetSoftmax()
{
using softmax_problem = BlockSoftmax2DProblem<typename Problem::WeightType>;
return BlockSoftmax2D<softmax_problem>{};
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetTopk()
{
using topk_problem = BlockTopkStream2DProblem<typename Problem::WeightType,
typename Problem::IndexType,
Problem::LanesPerRow>;
// Note: replicate is LanesPerRow
return BlockTopkStream2D<topk_problem>{};
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
template <typename InputType_,
typename WeightType_,
typename IndexType_,
index_t Experts_,
index_t IssuesPerCol_ = 2, // issue along col, to make sure block_reduce() OK
index_t BytesPerIssue_ = sizeof(InputType_),
index_t LaunchType_ = 0, // 0: streaming; >0: persistent, launching LaunchType_ blocks per CU
index_t BlockSize_ = 256>
struct TopkSoftmaxWarpPerRowProblem
{
// TODO: this kernel only supports warp per row
using InputType = remove_cvref_t<InputType_>;
using WeightType = remove_cvref_t<WeightType_>;
using IndexType = remove_cvref_t<IndexType_>;
static constexpr index_t LaunchType = LaunchType_;
static constexpr index_t Experts = Experts_;
static constexpr index_t BytesPerIssue = BytesPerIssue_;
static constexpr index_t IssuesPerCol = IssuesPerCol_;
static constexpr index_t BlockSize = BlockSize_;
static constexpr index_t WarpSize = get_warp_size();
static_assert(BytesPerIssue % sizeof(InputType) == 0);
static constexpr index_t VectorSize = BytesPerIssue / sizeof(InputType);
static_assert(Experts % VectorSize == 0);
static constexpr index_t LanesPerRow = min(Experts / VectorSize, WarpSize);
static_assert(WarpSize % LanesPerRow == 0);
static constexpr index_t RowsPerWarpPerColIssue = WarpSize / LanesPerRow;
static constexpr index_t RowsPerWarp = IssuesPerCol * RowsPerWarpPerColIssue;
static constexpr index_t IssuesPerRow = Experts / (LanesPerRow * VectorSize);
static constexpr index_t WarpsPerBlock = BlockSize / WarpSize;
static constexpr index_t RowsPerBlock = RowsPerWarp * WarpsPerBlock;
};
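// Worked example (hypothetical instantiation, wave64): with InputType = fp16, Experts = 8,
// IssuesPerCol = 2, BytesPerIssue = sizeof(fp16) and BlockSize = 256, the derived constants
// are VectorSize = 1, LanesPerRow = min(8, 64) = 8, RowsPerWarpPerColIssue = 64 / 8 = 8,
// RowsPerWarp = 2 * 8 = 16, IssuesPerRow = 1, WarpsPerBlock = 256 / 64 = 4 and
// RowsPerBlock = 16 * 4 = 64.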
} // namespace ck_tile
@@ -3,6 +3,8 @@
#pragma once
#include "ck_tile/ops/welford/block/block_welford.hpp"
#include "ck_tile/ops/welford/block/block_welford_problem.hpp"
#include "ck_tile/ops/welford/thread/thread_welford.hpp"
#include "ck_tile/ops/welford/warp/warp_welford.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/welford/thread/thread_welford.hpp"
namespace ck_tile {
template <typename Problem_, typename Policy_ = void>
struct BlockWelford
{
using Problem = remove_cvref_t<Problem_>;
using XDataType = typename Problem::XDataType;
using ComputeDataType = typename Problem::ComputeDataType;
CK_TILE_DEVICE constexpr BlockWelford() {}
// [CAUTION] - max_count_ is to deal with the padding problem
// max_count_ depends on the caller, e.g. naive and splitN welford will compute
// max_count_ differently
// -> use block_tile_welford_calculate_max_count to compute it
template <typename XDistributedTensor_,
typename MeanDistributedTensor_,
typename VarDistributedTensor_>
CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor,
MeanDistributedTensor_& mean_tensor,
VarDistributedTensor_& var_tensor,
int& cur_count_, // -> prefer init as zero
const int& max_count_)
{
constexpr auto I0 = number<0>{};
constexpr auto I1 = number<1>{};
constexpr auto spans = XDistributedTensor_::get_distributed_spans();
sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) {
if(cur_count_ < max_count_)
{
++cur_count_;
sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) {
constexpr auto in_dstr_idx = make_tuple(dstr_idx_i0, dstr_idx_i1);
constexpr auto out_dstr_idx = make_tuple(dstr_idx_i0);
auto x = ck_tile::type_convert<ComputeDataType>(x_tensor[in_dstr_idx]);
welford_update(
mean_tensor(out_dstr_idx), var_tensor(out_dstr_idx), x, cur_count_);
});
}
});
}
template <typename XDistributedTensor_>
CK_TILE_DEVICE static auto MakeMeanVarBlockTile()
{
static_assert(std::is_same_v<XDataType, typename XDistributedTensor_::DataType>, "wrong!");
constexpr auto reduce_dims = sequence<1>{};
constexpr auto dstr =
make_static_tile_distribution(detail::make_reduce_tile_distribution_encoding(
XDistributedTensor_::get_tile_distribution()
.get_static_tile_distribution_encoding(),
reduce_dims));
auto tensor = make_static_distributed_tensor<ComputeDataType>(dstr);
return tensor;
}
template <typename XDistributedTensor_>
CK_TILE_DEVICE auto
operator()(const XDistributedTensor_& x_tensor, int& cur_count_, const int& max_count_)
{
auto mean_tensor = MakeMeanVarBlockTile<XDistributedTensor_>();
auto var_tensor = MakeMeanVarBlockTile<XDistributedTensor_>();
clear_tile(mean_tensor);
clear_tile(var_tensor);
(*this)(x_tensor, mean_tensor, var_tensor, cur_count_, max_count_);
return ck_tile::make_tuple(mean_tensor, var_tensor);
}
};
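// For illustration only: a scalar sketch of the online update that welford_update is assumed
// to perform (the standard Welford recurrence); hypothetical helper, not part of the library.
#if 0
inline void welford_update_reference(float& mean, float& m2, float x, int count)
{
    // count is the number of samples including x (the caller above pre-increments it)
    float delta = x - mean;
    mean += delta / static_cast<float>(count);
    m2 += delta * (x - mean); // accumulates M2; block_tile_welford_post_scale_var divides by count
}
#endif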
template <typename Problem_, typename Policy_ = void>
struct BlockWelfordSync
{
using Problem = remove_cvref_t<Problem_>;
template <typename MeanDistributedTensor_, typename VarDistributedTensor_>
CK_TILE_DEVICE void
operator()(MeanDistributedTensor_& mean_tensor, VarDistributedTensor_& var_tensor, int& count)
{
using Dstr = typename MeanDistributedTensor_::StaticTileDistribution;
using DstrEncode = typename Dstr::DstrEncode;
using DstrEncodeDetail = typename DstrEncode::detail;
static_assert(std::is_same_v<Dstr, typename VarDistributedTensor_::StaticTileDistribution>,
"wrong!");
constexpr index_t NDimP = Dstr::get_num_of_dimension_p();
constexpr index_t NDimR = Dstr::get_num_of_dimension_r();
constexpr index_t idim_p_lane = NDimP - 1;
// const auto ps_idx = make_array<index_t>(get_warp_id(), get_lane_id());
// const auto rs_idx =
// mean_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);
constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size();
static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size());
const int original_count = count;
// loop over thread data
static_for<0, thread_buf_size, 1>{}([&](auto i) {
auto v_local_mean = mean_tensor.get_thread_buffer()[i];
auto v_local_var = var_tensor.get_thread_buffer()[i];
auto v_local_count = original_count;
// cross-lane reduce for replication
// only reduce on R dimension correspond to lane
// (lane id maps to this R dimension)
static_for<0, NDimR, 1>{}([&](auto idim_r) {
// FIXME: nasty to use does_p_own_r_
if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
{
constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
constexpr index_t lid_over_rid_derivative =
DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r];
static_assert(is_power_of_two_integer(r_length),
"wrong! only support power of 2 reduction");
constexpr index_t nstage = integer_log2_floor(r_length);
// reduction sweep forward
static_for<0, nstage, 1>{}([&](auto istage) {
// xor
index_t src_lane =
(__lane_id()) ^
(number<lid_over_rid_derivative << istage.value>{}.value);
// pull data from remote lane
const auto v_remote_mean = warp_shuffle(v_local_mean, src_lane);
const auto v_remote_var = warp_shuffle(v_local_var, src_lane);
const auto v_remote_count = warp_shuffle(v_local_count, src_lane);
// welford merge
welford_merge(v_local_mean,
v_local_var,
v_local_count,
v_remote_mean,
v_remote_var,
v_remote_count);
});
}
});
mean_tensor.get_thread_buffer()(i) = v_local_mean;
var_tensor.get_thread_buffer()(i) = v_local_var;
count = v_local_count;
});
}
};
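// For illustration only: a scalar sketch of the pairwise combine that welford_merge is assumed
// to implement (Chan et al. style merge of two partial mean/M2/count triples); hypothetical
// helper, not part of the library.
#if 0
inline void welford_merge_reference(
    float& mean_a, float& m2_a, int& count_a, float mean_b, float m2_b, int count_b)
{
    int count = count_a + count_b;
    if(count == 0)
        return;
    float delta = mean_b - mean_a;
    mean_a += delta * static_cast<float>(count_b) / static_cast<float>(count);
    m2_a += m2_b + delta * delta * static_cast<float>(count_a) * static_cast<float>(count_b) /
                       static_cast<float>(count);
    count_a = count;
}
#endif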
template <typename Problem_, typename Policy_ = void>
struct BlockWelfordCrossWarpSync
{
using Problem = remove_cvref_t<Problem_>;
using BlockShape = typename Problem::BlockShape;
template <typename MeanDistributedTensor_>
CK_TILE_DEVICE static constexpr index_t GetReduceWarps()
{
constexpr index_t num_reduce_warps = [&]() {
using Dstr = typename MeanDistributedTensor_::StaticTileDistribution;
using DstrEncode = typename Dstr::DstrEncode;
using DstrEncodeDetail = typename DstrEncode::detail;
constexpr index_t NDimR = Dstr::get_num_of_dimension_r();
constexpr index_t idim_p_warp = 0;
index_t len_ = 1;
static_for<0, NDimR, 1>{}([&](auto idim_r) {
if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_warp][idim_r])
{
constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
len_ *= r_length;
}
});
return len_;
}();
return num_reduce_warps;
}
// returns size in bytes
template <typename MeanDistributedTensor_>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{
// constexpr auto num_reduce_warps = GetReduceWarps<MeanDistributedTensor_>();
// data need to exchange is very small, we just pack mean+var+count -> 4dword
constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size();
// we need to store all data from every wave into smem
// e.g. 2x2 reduce along N
// -------------> reduce N
// | w0 | w1 | ___> | w01 |
// | w2 | w3 | | w23 |
//
// -> store data from every wave into LDS
//
//
// -------------> reduce N
// | w0 | w1 | w2 | w3 | -----> | w0123 |
//
// -> also store data from every wave into LDS
constexpr index_t num_warps = BlockShape::BlockSize / warpSize;
return num_warps * 4 * thread_buf_size * sizeof(float);
}
template <typename MeanDistributedTensor_, typename VarDistributedTensor_>
CK_TILE_DEVICE void operator()(MeanDistributedTensor_& mean_tensor,
VarDistributedTensor_& var_tensor,
int& count,
void* smem)
{
using DataType = typename MeanDistributedTensor_::DataType;
using Dstr = typename MeanDistributedTensor_::StaticTileDistribution;
// using DstrEncode = typename Dstr::DstrEncode;
// using DstrEncodeDetail = typename DstrEncode::detail;
static_assert(std::is_same_v<Dstr, typename VarDistributedTensor_::StaticTileDistribution>,
"wrong!");
constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size();
static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size());
// Note: we always pack everything into fp32x4
fp32x4_t* smem_ptr = reinterpret_cast<fp32x4_t*>(smem);
const index_t lane_id = get_lane_id();
const index_t warp_id = get_warp_id();
constexpr auto num_reduce_warps = GetReduceWarps<MeanDistributedTensor_>();
constexpr index_t num_warps = BlockShape::BlockSize / warpSize;
const index_t smem_offset = warp_id;
// skip if nothing to do
if constexpr(num_reduce_warps == 1)
return;
// store into smem only for lane-0 within one warp
if(lane_id == 0)
{
static_for<0, thread_buf_size, 1>{}([&](auto i) {
fp32x4_t local_scratch_;
local_scratch_[0] = bit_cast<float>(mean_tensor.get_thread_buffer()[i]);
local_scratch_[1] = bit_cast<float>(var_tensor.get_thread_buffer()[i]);
local_scratch_[2] = bit_cast<float>(count);
smem_ptr[smem_offset + i * num_warps] = local_scratch_;
});
}
block_sync_lds();
// load from smem. here we let every thread do the compute :)
index_t local_warp_id = warp_id / num_reduce_warps;
index_t local_smem_os = local_warp_id * num_reduce_warps;
fp32x4_t all_scratch[thread_buf_size * num_reduce_warps];
static_for<0, thread_buf_size, 1>{}([&](auto i_0) {
static_for<0, num_reduce_warps, 1>{}([&](auto i_1) {
all_scratch[i_0 * num_reduce_warps + i_1] =
smem_ptr[i_0 * num_warps + local_smem_os + i_1];
});
});
block_sync_lds(); // TODO: we don't need sync here
// const int original_count = count;
static_for<0, thread_buf_size, 1>{}([&](auto i_0) {
// TODO: use descriptor for this
auto v_local = all_scratch[i_0 * num_reduce_warps];
auto v_local_mean = bit_cast<DataType>(v_local[0]);
auto v_local_var = bit_cast<DataType>(v_local[1]);
auto v_local_count = bit_cast<int>(v_local[2]);
// further reduce mean/var
static_for<0, num_reduce_warps - 1, 1>{}([&](auto i_1_n1) {
constexpr auto i_1 = number<i_1_n1 + 1>{};
const fp32x4_t v_remote = all_scratch[i_0 * num_reduce_warps + i_1];
const auto v_remote_mean = bit_cast<DataType>(v_remote[0]);
const auto v_remote_var = bit_cast<DataType>(v_remote[1]);
const auto v_remote_count = bit_cast<int>(v_remote[2]);
welford_merge(v_local_mean,
v_local_var,
v_local_count,
v_remote_mean,
v_remote_var,
v_remote_count);
});
mean_tensor.get_thread_buffer()(i_0) = v_local_mean;
var_tensor.get_thread_buffer()(i_0) = v_local_var;
count = v_local_count;
});
}
};
// compute the max count for a last dim reduce
// everything may have vector/repeat, so the max count could be uneven
// TODO: specify which dim to compute and proper set the problem
// TODO: BlockShape we reuse layernorm_fwd_shape :)
template <typename BlockShape>
CK_TILE_DEVICE constexpr index_t block_tile_welford_calculate_max_count(int row_size)
{
#if 0
using S = BlockShape;
index_t LastloopN = row_size % S::Block_N == 0 ? S::Block_N : row_size % S::Block_N;
constexpr index_t NThread = S::WarpPerBlock_N * S::ThreadPerWarp_N;
index_t iNLane = get_thread_id() % NThread;
index_t iN0 = LastloopN / (S::Vector_N * S::ThreadPerWarp_N);
index_t iN1 = (LastloopN % (S::Vector_N * S::ThreadPerWarp_N)) / S::Vector_N;
index_t N2 = (LastloopN % (S::Vector_N * S::ThreadPerWarp_N)) % S::Vector_N;
index_t iN3 = iNLane < iN1 ? S::Vector_N : iNLane == iN1 ? N2 : 0;
return iN0 * S::Vector_N + iN3;
#endif
using S_ = BlockShape;
constexpr index_t ThreadsPerBlock_N = S_::WarpPerBlock_N * S_::ThreadPerWarp_N;
// TODO: we always assume the vector size divides evenly, i.e. row_size must be divisible by Vector_N
const index_t element_per_row = row_size / S_::Vector_N;
index_t lane_id_n = get_thread_id() % ThreadsPerBlock_N;
index_t cnt = 0;
// TODO: Repeat_N can not be too long, otherwise this is not good
static_for<0, S_::Repeat_N, 1>{}([&](auto) {
index_t _a = lane_id_n < element_per_row ? 1 : 0;
cnt += _a;
lane_id_n += ThreadsPerBlock_N;
});
return cnt * S_::Vector_N;
}
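// Worked example (hypothetical shape): with WarpPerBlock_N = 1, ThreadPerWarp_N = 16,
// Vector_N = 8, Repeat_N = 2 and row_size = 200, element_per_row = 200 / 8 = 25;
// lane 5 sees lane_id_n = 5 and 21 (both < 25), so its max count is 2 * 8 = 16 elements,
// while lane 10 sees 10 and 26, giving 1 * 8 = 8.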
// Note: this function must be called after all the computation
template <typename VarDistributedTensor_>
CK_TILE_DEVICE constexpr void block_tile_welford_post_scale_var(VarDistributedTensor_& var_tensor,
int count)
{
using DataType = typename VarDistributedTensor_::DataType;
tile_elementwise_inout([&count](auto& x) { x = x / type_convert<DataType>(count); },
var_tensor);
}
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <typename XDataType_, typename ComputeDataType_, typename BlockShape_>
struct BlockWelfordProblem
{
using XDataType = remove_cvref_t<XDataType_>;
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
using BlockShape = remove_cvref_t<BlockShape_>;
};
} // namespace ck_tile