"docs/source/en/vscode:/vscode.git/clone" did not exist on "a0597f33aca9ead4323800120c6ecfc323ccba48"
Commit 483ad69a authored by carlushuang

add better unpack uford loops

parent b76ef72c
# add_test_executable(test_topk_softmax topk_softmax.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../example/ck_tile/05_moe/topk_softmax_api.cpp)
# target_include_directories(test_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../example/ck_tile/05_moe)
# target_compile_options(test_topk_softmax PRIVATE -v --save-temps -Wno-gnu-line-marker)
add_executable(tile_example_topk_softmax EXCLUDE_FROM_ALL topk_softmax.cpp topk_softmax_api.cpp)
target_include_directories(tile_example_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
target_compile_options(tile_example_topk_softmax PRIVATE -v --save-temps -Wno-gnu-line-marker)
#!/bin/sh
-EXE=./build/bin/test_topk_softmax
+EXE=./build/bin/tile_example_topk_softmax
for pr_i in "fp16" "bf16" ; do
$EXE -pr_i=$pr_i -t=80 -e=17
......
@@ -209,6 +209,35 @@ bool test_topk_softmax(ck_tile::ArgParser args)
x_dev.ToDevice(x_host.data());
{
// using sss = ck_tile::sequence<2, 3, 4>;
// using pks = ck_tile::sequence<2, 1, 4>;
// using ord = ck_tile::sequence<2, 0, 1>;
// ck_tile::static_uford<sss, pks, ord>{}(
// [&](auto i_0, auto i_1, auto i_2, auto i_3) {
// i_0.fo_0();
// i_1.fo_1();
// i_2.fo_2();
// i_3.fo_3();
// }
// );
// constexpr auto uf = ck_tile::static_uford<sss, pks, ord>{};
// ck_tile::static_for<0, uf.get_num_of_access(), 1>{}([&](auto i_access){
// uf([&](auto i_0, auto i_1, auto i_2, auto i_3, auto i_4, auto i_5, auto i_6, auto i_7) {
// decltype(i_0)::push_front(i_access).fo_0();
// decltype(i_1)::push_front(i_access).fo_1();
// decltype(i_2)::push_front(i_access).fo_2();
// decltype(i_3)::push_front(i_access).fo_3();
// decltype(i_4)::push_front(i_access).fo_4();
// decltype(i_5)::push_front(i_access).fo_5();
// decltype(i_6)::push_front(i_access).fo_6();
// decltype(i_7)::push_front(i_access).fo_7();
// },
// i_access);
// });
}
topk_softmax_trait trait = [&]() {
topk_softmax_trait t_;
t_.input_type = input_prec;
......
@@ -5,5 +5,6 @@ include_directories(AFTER
add_subdirectory(01_fmha)
add_subdirectory(02_layernorm2d)
add_subdirectory(03_gemm)
add_subdirectory(04_topk_softmax)
add_subdirectory(06_permute)
add_subdirectory(19_elementwise)
@@ -54,6 +54,7 @@
#include "ck_tile/core/tensor/update_tile.hpp" #include "ck_tile/core/tensor/update_tile.hpp"
#include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/functional_with_tuple.hpp"
#include "ck_tile/core/utility/ignore.hpp" #include "ck_tile/core/utility/ignore.hpp"
#include "ck_tile/core/utility/magic_div.hpp" #include "ck_tile/core/utility/magic_div.hpp"
#include "ck_tile/core/utility/philox_rand.hpp" #include "ck_tile/core/utility/philox_rand.hpp"
......
@@ -1111,4 +1111,126 @@ CK_TILE_HOST_DEVICE constexpr auto generate_array(F&& f, number<N>)
typename arithmetic_sequence_gen<0, N, 1>::type{});
}
namespace impl {
template <typename, typename, typename, index_t>
struct reverse_slice_sequence_impl;
template <index_t x,
index_t... xs,
index_t m,
index_t... ms,
index_t id,
index_t... ids,
index_t SliceSize>
struct reverse_slice_sequence_impl<sequence<x, xs...>,
sequence<m, ms...>,
sequence<id, ids...>,
SliceSize>
{
using old_scan =
reverse_slice_sequence_impl<sequence<xs...>, sequence<ms...>, sequence<ids...>, SliceSize>;
static constexpr auto slice_size = old_scan::remaining_slice_sizes::front().value;
static constexpr auto slice_length =
std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
using dim_lengths =
typename sequence_merge<sequence<slice_length>, typename old_scan::dim_lengths>::type;
using dim_slices =
typename sequence_merge<sequence<x / slice_length>, typename old_scan::dim_slices>::type;
using remaining_slice_sizes = typename sequence_merge<
std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>,
typename old_scan::remaining_slice_sizes>::type;
// the first idx at which the sliced length is not equal to the original length
static constexpr index_t _flag =
slice_length != x && remaining_slice_sizes{}.front().value == 1;
static constexpr index_t _split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
static constexpr index_t _split_idx =
std::conditional_t<_split_flag, number<id>, number<0>>::value;
static constexpr index_t split_flag = _split_flag || old_scan::split_flag;
static constexpr index_t split_idx = std::
conditional_t<old_scan::split_flag, number<old_scan::split_idx>, number<_split_idx>>::value;
};
template <index_t x, index_t m, index_t id, index_t SliceSize>
struct reverse_slice_sequence_impl<sequence<x>, sequence<m>, sequence<id>, SliceSize>
{
static constexpr auto slice_size = SliceSize;
static constexpr auto slice_length =
std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
using dim_lengths = sequence<slice_length>;
using dim_slices = sequence<x / slice_length>;
using remaining_slice_sizes =
std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>;
// the first idx at which the sliced length is not equal to the original length
static constexpr index_t _flag =
slice_length != x && remaining_slice_sizes{}.front().value == 1;
static constexpr index_t split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
static constexpr index_t split_idx =
std::conditional_t<split_flag, number<id>, number<0>>::value;
};
} // namespace impl
// clang-format off
// input: a sequence (with an optional mask) and SliceSize, the size of each slice
// output: the per-slice lengths and the number of slices along each dimension
//
// e.g. <2, 1, 4, 2>, 8 -> lengths:<1, 1, 4, 2> , nums: <2, 1, 1, 1> : 2 slices , slice_idx: 0
// <4, 2, 4, 1, 2>, 4 -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 1> : 16 slices , slice_idx: 2
// <4, 2, 4, 1, 6>, 4 -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 3> : 48 slices , slice_idx: 2
// <4, 2, 5, 1, 2>, 10 -> lengths:<1, 1, 5, 1, 2> , nums: <4, 2, 1, 1, 1> : 8 slices , slice_idx: 1
//
// <4, 2, 8>, 64 -> lengths:<4, 2, 8> , nums: <1, 1, 1> : 1 slices , slice_idx: 0
// <4, 2, 8>, 32 -> lengths:<2, 2, 8> , nums: <2, 1, 1> : 2 slices , slice_idx: 0
// <4, 2, 8>, 16 -> lengths:<1, 2, 8> , nums: <4, 1, 1> : 4 slices , slice_idx: 0
// <4, 2, 8>, 8 -> lengths:<1, 1, 8> , nums: <4, 2, 1> : 8 slices , slice_idx: 1
// <4, 2, 8>, 4 -> lengths:<1, 1, 4> , nums: <4, 2, 2> : 16 slices , slice_idx: 2
// <4, 2, 8>, 2 -> lengths:<1, 1, 2> , nums: <4, 2, 4> : 32 slices , slice_idx: 2
// <4, 2, 8>, 1 -> lengths:<1, 1, 1> , nums: <4, 2, 8> : 64 slices , slice_idx: 2
//
// <4, 2, 1, 4, 2> / 4 ->
// mask:<1, 1, 1, 0, 1>, -> lengths:<1, 2, 1, 4, 2> , nums: <4, 1, 1, 1, 1> : 8 slices , slice_idx: 0
//
// returns tuple<slice_lengths, slice_nums, slice_index>; slice_index is the index at which the
// slices start to split (scanning right -> left), i.e. the first index whose sliced length
// differs from the original length
// clang-format on
template <typename Seq,
index_t SliceSize,
typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
constexpr auto reverse_slice_sequence(Seq,
number<SliceSize>,
Mask = typename uniform_sequence_gen<Seq::size(), 1>::type{})
{
static_assert(Seq::size() == Mask::size());
using sliced_type =
impl::reverse_slice_sequence_impl<Seq,
Mask,
typename arithmetic_sequence_gen<0, Seq::size(), 1>::type,
SliceSize>;
static_assert(sliced_type::remaining_slice_sizes::front().value == 1,
"can not evenly divide this sequence, please check");
return make_tuple(typename sliced_type::dim_lengths{},
typename sliced_type::dim_slices{},
number<sliced_type::split_idx>{});
}
template <typename Seq,
index_t SliceSize,
typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
constexpr auto slice_sequence(Seq,
number<SliceSize>,
Mask = typename uniform_sequence_gen<Seq::size(), 1>::type{})
{
constexpr auto r =
reverse_slice_sequence(Seq{}.reverse(), number<SliceSize>{}, Mask{}.reverse());
return make_tuple(r[number<0>{}].reverse(),
r[number<1>{}].reverse(),
number<Seq::size() - r[number<2>{}] - 1>{});
}
} // namespace ck_tile
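As a quick illustration of the table above, the documented case <2, 1, 4, 2> sliced by 8 can be checked at compile time. The sketch below is not part of this commit; it assumes reverse_slice_sequence is reachable through ck_tile/core.hpp and accesses the returned tuple the same way slice_sequence does above.

#include <type_traits>
#include "ck_tile/core.hpp"

// <2, 1, 4, 2> / 8 -> lengths <1, 1, 4, 2>, nums <2, 1, 1, 1>, slice_idx 0 (see the comment block above)
constexpr auto r_example =
    ck_tile::reverse_slice_sequence(ck_tile::sequence<2, 1, 4, 2>{}, ck_tile::number<8>{});

static_assert(std::is_same_v<ck_tile::remove_cvref_t<decltype(r_example[ck_tile::number<0>{}])>,
                             ck_tile::sequence<1, 1, 4, 2>>,
              "per-slice lengths");
static_assert(std::is_same_v<ck_tile::remove_cvref_t<decltype(r_example[ck_tile::number<1>{}])>,
                             ck_tile::sequence<2, 1, 1, 1>>,
              "number of slices along each dim");
static_assert(ck_tile::remove_cvref_t<decltype(r_example[ck_tile::number<2>{}])>::value == 0,
              "index at which splitting starts");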
@@ -8,6 +8,7 @@
#include "ck_tile/core/numeric/integral_constant.hpp" #include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/tensor/tile_distribution.hpp" #include "ck_tile/core/tensor/tile_distribution.hpp"
#include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/functional_with_tuple.hpp"
#include "ck_tile/core/utility/type_traits.hpp" #include "ck_tile/core/utility/type_traits.hpp"
namespace ck_tile { namespace ck_tile {
@@ -27,4 +28,18 @@ CK_TILE_DEVICE void sweep_tile_span(TileDistributedSpan_, const F& f)
});
}
// unpacked sweep over a span: this version supports an unpacking (multi-argument) functor
//
template <
typename TileDistributedSpan_, // tile_distributed_span<...>
typename F, // signature: F(tile_distributed_index<...>)
typename Unpacks = typename uniform_sequence_gen<TileDistributedSpan_::Impl::size(), 1>::type>
CK_TILE_DEVICE void sweep_tile_uspan(TileDistributedSpan_, const F& f, Unpacks = {})
{
using DstrSpan = remove_cvref_t<TileDistributedSpan_>;
static_uford<typename DstrSpan::Impl, Unpacks>{}(
[&](auto... dstr_idx_impl) { f(detail::make_tile_distributed_index(dstr_idx_impl)...); });
}
} // namespace ck_tile
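To make the Unpacks argument concrete, here is, reproduced purely as an illustration, the call site that BlockReduce2D::operator() adds later in this commit; acc, t, dstr_idx_i0, reduce_func and row_y_unpacks all come from that surrounding function.

// excerpt from BlockReduce2D::operator() (added further down in this commit)
sweep_tile_uspan(
    spans[number<1>{}],
    [&](auto... dstr_idx_i1) {
        // with row_y_unpacks such as sequence<..., 2> this pack holds 2 indices, so a
        // 3-operand reducer (e.g. one backed by v_max3_f32) folds 2 loads at once
        acc = reduce_func(acc, t[make_tuple(dstr_idx_i0, dstr_idx_i1)]...);
    },
    row_y_unpacks);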
@@ -534,113 +534,6 @@ CK_TILE_HOST_DEVICE constexpr auto make_static_tile_distribution(StaticTileDistr
//***********************************************************************************
namespace detail {
template <typename, typename, typename, index_t>
struct reverse_slice_sequence_impl;
template <index_t x,
index_t... xs,
index_t m,
index_t... ms,
index_t id,
index_t... ids,
index_t SliceSize>
struct reverse_slice_sequence_impl<sequence<x, xs...>,
sequence<m, ms...>,
sequence<id, ids...>,
SliceSize>
{
using old_scan =
reverse_slice_sequence_impl<sequence<xs...>, sequence<ms...>, sequence<ids...>, SliceSize>;
static constexpr auto slice_size = old_scan::remaining_slice_sizes::front().value;
static constexpr auto slice_length =
std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
using dim_lengths =
typename sequence_merge<sequence<slice_length>, typename old_scan::dim_lengths>::type;
using dim_slices =
typename sequence_merge<sequence<x / slice_length>, typename old_scan::dim_slices>::type;
using remaining_slice_sizes = typename sequence_merge<
std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>,
typename old_scan::remaining_slice_sizes>::type;
// the first idx that sliced length not equal to original length
static constexpr index_t _flag =
slice_length != x && remaining_slice_sizes{}.front().value == 1;
static constexpr index_t _split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
static constexpr index_t _split_idx =
std::conditional_t<_split_flag, number<id>, number<0>>::value;
static constexpr index_t split_flag = _split_flag || old_scan::split_flag;
static constexpr index_t split_idx = std::
conditional_t<old_scan::split_flag, number<old_scan::split_idx>, number<_split_idx>>::value;
};
template <index_t x, index_t m, index_t id, index_t SliceSize>
struct reverse_slice_sequence_impl<sequence<x>, sequence<m>, sequence<id>, SliceSize>
{
static constexpr auto slice_size = SliceSize;
static constexpr auto slice_length =
std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
using dim_lengths = sequence<slice_length>;
using dim_slices = sequence<x / slice_length>;
using remaining_slice_sizes =
std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>;
// the first idx that sliced length not equal to original length
static constexpr index_t _flag =
slice_length != x && remaining_slice_sizes{}.front().value == 1;
static constexpr index_t split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
static constexpr index_t split_idx =
std::conditional_t<split_flag, number<id>, number<0>>::value;
};
// clang-format off
// input a sequence(with optional mask), and the SliceSize : size per slice
// output the sequence each slice, and number of slices
//
// e.g. <2, 1, 4, 2>, 8 -> lengths:<1, 1, 4, 2> , nums: <2, 1, 1, 1> : 2 slices , slice_idx: 0
// <4, 2, 4, 1, 2>, 4 -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 1> : 16 slices , slice_idx: 2
// <4, 2, 4, 1, 6>, 4 -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 3> : 48 slices , slice_idx: 2
// <4, 2, 5, 1, 2>, 10 -> lengths:<1, 1, 5, 1, 2> , nums: <4, 2, 1, 1, 1> : 8 slices , slice_idx: 1
//
// <4, 2, 8>, 64 -> lengths:<4, 2, 8> , nums: <1, 1, 1> : 1 slices , slice_idx: 0
// <4, 2, 8>, 32 -> lengths:<2, 2, 8> , nums: <2, 1, 1> : 2 slices , slice_idx: 0
// <4, 2, 8>, 16 -> lengths:<1, 2, 8> , nums: <4, 1, 1> : 4 slices , slice_idx: 0
// <4, 2, 8>, 8 -> lengths:<1, 1, 8> , nums: <4, 2, 1> : 8 slices , slice_idx: 1
// <4, 2, 8>, 4 -> lengths:<1, 1, 4> , nums: <4, 2, 2> : 16 slices , slice_idx: 2
// <4, 2, 8>, 2 -> lengths:<1, 1, 2> , nums: <4, 2, 4> : 32 slices , slice_idx: 2
// <4, 2, 8>, 1 -> lengths:<1, 1, 1> , nums: <4, 2, 8> : 64 slices , slice_idx: 2
//
// <4, 2, 1, 4, 2> / 4 ->
// mask:<1, 1, 1, 0, 1>, -> lengths:<1, 2, 1, 4, 2> , nums: <4, 1, 1, 1, 1> : 8 slices , slice_idx: 0
//
// return tuple<slice_lengths, slice_nums, slice_index>, slice_index is at which index will start
// have split slices (right -> left)
// or the first index that sliced length is different from the original length
// clang-format on
template <typename Seq,
index_t SliceSize,
typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
constexpr auto reverse_slice_sequence(Seq,
number<SliceSize>,
Mask = typename uniform_sequence_gen<Seq::size(), 1>::type{})
{
static_assert(Seq::size() == Mask::size());
using sliced_type =
reverse_slice_sequence_impl<Seq,
Mask,
typename arithmetic_sequence_gen<0, Seq::size(), 1>::type,
SliceSize>;
static_assert(sliced_type::remaining_slice_sizes::front().value == 1,
"can not evenly divide this sequence, please check");
return make_tuple(typename sliced_type::dim_lengths{},
typename sliced_type::dim_slices{},
number<sliced_type::split_idx>{});
}
//
// slice tensor from x_dim, result in split in y_dim, not p_dim.
// We don't support slice cross p_dim (aka, slice different threads)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
// This file should not be included inside tuple.hpp!
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/numeric/math.hpp"
#include "ck_tile/core/container/sequence.hpp"
#include "ck_tile/core/container/tuple.hpp"
#include "ck_tile/core/utility/type_traits.hpp"
#include <stdint.h>
#include <utility>
namespace ck_tile {
namespace detail {
// RemainLengths: sequence<...>
// Orders: sequence<...>
template <class RemainLengths, class RamainUnpacks, class Orders>
struct static_uford_impl
{
CK_TILE_HOST_DEVICE constexpr static_uford_impl()
{
static_assert(RemainLengths::size() > 0, "wrong! should not get here");
static_assert(RamainUnpacks::size() > 0, "wrong! should not get here");
}
template <class F, class CurrentUnpackIds>
CK_TILE_HOST_DEVICE constexpr void operator()(F f, CurrentUnpackIds) const
{
constexpr index_t pack_len = RamainUnpacks::front();
static_for<0, RemainLengths::front(), pack_len>{}([=](auto I) {
constexpr auto new_pack = generate_tuple(
[&](auto idx_) {
constexpr auto i_new_pack = number<I + idx_ % pack_len>{};
constexpr auto i_pre_pack = number<idx_ / pack_len>{};
return CurrentUnpackIds{}.at(i_pre_pack).push_back(i_new_pack);
},
number<CurrentUnpackIds::size() * pack_len>{});
static_uford_impl<decltype(RemainLengths::pop_front()),
decltype(RamainUnpacks::pop_front()),
Orders>{}(f, new_pack);
});
}
};
template <class Orders>
struct static_uford_impl<sequence<>, sequence<>, Orders>
{
template <class F, class PackedId>
CK_TILE_HOST_DEVICE constexpr void operator()(F f, PackedId) const
{
constexpr auto origin_packs = transform_tuples(
[](auto pack_) { return decltype(pack_)::reorder_old_to_new(Orders{}); }, PackedId{});
unpack(f, origin_packs);
}
};
template <class RemainLengths, class RamainUnpacks, class Orders>
struct static_uford_one_shot_impl
{
template <class F, class CurrentUnpackIds, index_t current_acc>
CK_TILE_HOST_DEVICE constexpr void operator()(F f, CurrentUnpackIds, number<current_acc>) const
{
constexpr auto r_lens_stride =
reverse_exclusive_scan_sequence(RemainLengths{}, multiplies{}, number<1>{});
constexpr auto r_upks_stride =
reverse_exclusive_scan_sequence(RamainUnpacks{}, multiplies{}, number<1>{});
constexpr index_t current_stride = r_lens_stride.front() / r_upks_stride.front();
constexpr index_t pack_len = RamainUnpacks::front();
constexpr index_t current_idx = (current_acc / current_stride) * pack_len;
constexpr auto new_pack = generate_tuple(
[&](auto idx_) {
constexpr auto i_new_pack = number<current_idx + idx_ % pack_len>{};
constexpr auto i_pre_pack = number<idx_ / pack_len>{};
return CurrentUnpackIds{}.at(i_pre_pack).push_back(i_new_pack);
},
number<CurrentUnpackIds::size() * pack_len>{});
static_uford_one_shot_impl<decltype(RemainLengths::pop_front()),
decltype(RamainUnpacks::pop_front()),
Orders>{}(f, new_pack, number<current_acc % current_stride>{});
}
};
template <class Orders>
struct static_uford_one_shot_impl<sequence<>, sequence<>, Orders>
{
template <class F, class PackedId, index_t current_acc>
CK_TILE_HOST_DEVICE constexpr void operator()(F f, PackedId, number<current_acc>) const
{
constexpr auto origin_packs = transform_tuples(
[](auto pack_) { return decltype(pack_)::reorder_old_to_new(Orders{}); }, PackedId{});
unpack(f, origin_packs);
}
};
} // namespace detail
// loop over an n-d space (sequence) with packs
// the functor passed in must take as many arguments as there are packs
//
// e.g.
// Lengths=seq<2, 3, 4>, Unpacks=<1, 1, 2>
// static_uford<Lengths, Unpacks>{}([&](auto i_0, auto i_1){}); // requires 2 args (packs)
//
// loop #0, i_0=seq<0, 0, 0>, i_1=<0, 0, 1>
// loop #1, i_0=seq<0, 0, 2>, i_1=<0, 0, 3>
// loop #2, i_0=seq<0, 1, 0>, i_1=<0, 1, 1>
// loop #3, i_0=seq<0, 1, 2>, i_1=<0, 1, 3>
// loop #4, i_0=seq<0, 2, 0>, i_1=<0, 2, 1>
// loop #5, i_0=seq<0, 2, 2>, i_1=<0, 2, 3>
// loop #6, i_0=seq<1, 0, 0>, i_1=<1, 0, 1>
// ...
template <class Lengths,
class Unpacks = typename uniform_sequence_gen<Lengths::size(), 1>::type,
class Orders = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type>
struct static_uford
{
static constexpr index_t num_packs = reduce_on_sequence(Unpacks{}, multiplies{}, number<1>{});
CK_TILE_HOST_DEVICE constexpr static_uford()
{
static_assert(Lengths::size() > 0, "wrong! Lengths is empty");
static_assert(Lengths::size() == Unpacks::size(), "wrong! inconsistent size");
static_assert(Lengths::size() == Orders::size(), "wrong! inconsistent size");
static_for<0, Lengths::size(), 1>{}(
[&](auto i) { static_assert(Lengths{}.at(i) % Unpacks{}.at(i) == 0); });
}
CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_access()
{
using L_ = decltype(Lengths{} / Unpacks{});
return reduce_on_sequence(L_{}, multiplies{}, number<1>{});
}
// F signature: F(sequence<...> multi_id...)
// multi_id is the unordered multi-index
template <class F>
CK_TILE_HOST_DEVICE constexpr void operator()(F f) const
{
constexpr auto ordered_lengths = Lengths::reorder_new_to_old(Orders{});
constexpr auto ordered_unpacks = Unpacks::reorder_new_to_old(Orders{});
detail::static_uford_impl<decltype(ordered_lengths), decltype(ordered_unpacks), Orders>{}(
f, make_tuple(sequence<>{}));
}
// this overload is convenient for issuing the accesses one by one
template <class F, index_t i_access>
CK_TILE_HOST_DEVICE constexpr void operator()(F f, number<i_access>) const
{
static_assert(i_access < get_num_of_access());
constexpr auto ordered_lengths = Lengths::reorder_new_to_old(Orders{});
constexpr auto ordered_unpacks = Unpacks::reorder_new_to_old(Orders{});
detail::static_uford_one_shot_impl<decltype(ordered_lengths),
decltype(ordered_unpacks),
Orders>{}(
f, make_tuple(sequence<>{}), number<i_access>{});
}
};
} // namespace ck_tile
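A minimal host-side sketch of the two ways to drive static_uford, following the seq<2, 3, 4> / <1, 1, 2> example documented above and the access-by-access pattern from the commented-out test code earlier in this commit (illustrative only; it assumes ck_tile/core.hpp pulls in this header):

#include "ck_tile/core.hpp"

void uford_sketch()
{
    using lens = ck_tile::sequence<2, 3, 4>;
    using upks = ck_tile::sequence<1, 1, 2>; // unpack 2 along the last dim -> 2 index packs per call

    constexpr auto uf = ck_tile::static_uford<lens, upks>{};

    // all accesses in one call; the first call gets i_0 = seq<0, 0, 0>, i_1 = seq<0, 0, 1>
    uf([&](auto i_0, auto i_1) {
        (void)i_0;
        (void)i_1;
    });

    // one access per call, convenient when issuing the work piece by piece
    ck_tile::static_for<0, uf.get_num_of_access(), 1>{}([&](auto i_access) {
        uf(
            [&](auto i_0, auto i_1) {
                (void)i_0;
                (void)i_1;
            },
            i_access);
    });
}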
@@ -1079,8 +1079,9 @@ struct ConvScaleRelu
float scale_out_;
};
-template<typename DstType, typename SrcType>
-struct Cast {
+template <typename DstType, typename SrcType>
+struct Cast
+{
template <typename T>
CK_TILE_HOST_DEVICE void operator()(DstType& y, const SrcType& x) const
{
......
@@ -295,4 +295,106 @@ CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor,
return acc_tensor;
}
// this version only supports 2D->1D reduce (reduce-dim=seq<0, 1>)
// this version only supports in/acc/out data types that are the same
// this version performs the in-thread reduce and the cross-lane sync reduce in one function call
//
template <typename InDistributedTensor_>
struct BlockReduce2D
{
using InDistributedTensor = remove_cvref_t<InDistributedTensor_>;
using InDataType = typename InDistributedTensor::DataType;
CK_TILE_HOST_DEVICE BlockReduce2D(const InDistributedTensor& t_, const InDataType& reduce_init_)
: t(t_), reduce_init(reduce_init_)
{
}
CK_TILE_HOST_DEVICE constexpr auto make_out_distributed_tensor() const
{
using ReduceDim = sequence<1>; // hard coded
constexpr auto acc_dstr =
make_static_tile_distribution(ck_tile::detail::make_reduce_tile_distribution_encoding(
InDistributedTensor::get_tile_distribution()
.get_static_tile_distribution_encoding(),
ReduceDim{}));
return make_static_distributed_tensor<InDataType>(acc_dstr);
}
// return the number of pixels each lane needs to reduce
CK_TILE_HOST_DEVICE constexpr auto get_reduce_length_y() const
{
constexpr auto spans = InDistributedTensor::get_distributed_spans();
// product of the per-lane y lengths along the reduced (second) span
return reduce_on_sequence(
typename decltype(spans[number<1>{}])::Impl{}, multiplies{}, number<1>{});
}
// Here ReducePacksPerXDim does not have the same meaning as in static_uford/sweep_tile_uspan:
// it is the number of packs along each X dim; the Unpacks along the Y dims are computed
// internally.
// For simplicity we only support packing along the row dimension, so ReducePacksPerXDim always
// has 2 elements and the first element is ignored. Also for simplicity, we always scan
// right-to-left to find along which Y dim to split.
template <typename ReduceFunc,
typename ReduceSyncFunc,
typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
CK_TILE_HOST_DEVICE auto operator()(const ReduceFunc& reduce_func,
const ReduceSyncFunc& reduce_sync_func,
ReducePacksPerXDim = {}) const
{
constexpr auto spans = InDistributedTensor::get_distributed_spans();
constexpr auto row_y_unpacks = [&]() {
constexpr auto row_y_lengths = typename decltype(spans[number<1>{}])::Impl{};
constexpr auto row_y_size =
reduce_on_sequence(row_y_lengths, multiplies{}, number<1>{});
constexpr auto row_y_packs = ReducePacksPerXDim{}.at(number<1>{});
static_assert(row_y_size % row_y_packs == 0);
constexpr auto row_y_slice_size = row_y_size / row_y_packs;
constexpr auto slice_info = slice_sequence(row_y_lengths, number<row_y_slice_size>{});
constexpr auto unpacks = slice_info[number<1>{}];
return unpacks;
}();
auto acc_tensor = make_out_distributed_tensor();
// in-thread reduction
// FIXME: hard coded to be 2D to 1D reduction
sweep_tile_span(spans[number<0>{}], [&](auto dstr_idx_i0) {
constexpr auto acc_dstr_idx = make_tuple(dstr_idx_i0);
auto acc = acc_tensor[acc_dstr_idx];
sweep_tile_uspan(
spans[number<1>{}],
[&](auto... dstr_idx_i1) {
acc = reduce_func(acc, t[make_tuple(dstr_idx_i0, dstr_idx_i1)]...);
},
row_y_unpacks);
acc_tensor(acc_dstr_idx) = acc;
});
// TODO: always use xor to do cross-lane reduce
block_tile_reduce_xor_sync(acc_tensor, reduce_sync_func);
return acc_tensor;
}
template <typename ReduceFunc>
CK_TILE_HOST_DEVICE auto operator()(const ReduceFunc& reduce_func) const
{
return operator()(reduce_func, reduce_func);
}
InDistributedTensor t;
InDataType reduce_init;
};
// deduction guide
template <typename T>
CK_TILE_HOST_DEVICE_EXTERN BlockReduce2D(const T&, const typename T::DataType&)->BlockReduce2D<T>;
} // namespace ck_tile
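For reference, the call pattern this struct enables is the one BlockSoftmax2D adopts below; a condensed sketch, assuming x is a 2D distributed tensor of DataType and f_max/f_max3 are the reducers defined there:

// default path: one 2-operand reducer drives both the in-thread sweep and the xor cross-lane reduce
auto reduce_row_max = BlockReduce2D{x, -numeric<DataType>::infinity()};
auto row_max        = reduce_row_max(f_max);

// unpacked path: a 3-operand reducer (e.g. backed by v_max3_f32) for the in-thread part,
// a 2-operand one for the cross-lane part, and 2 packs requested along the row (X) dim
auto row_max_u2 = reduce_row_max(f_max3, f_max, sequence<1, 2>{});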
@@ -6,6 +6,8 @@
#include "ck_tile/core.hpp" #include "ck_tile/core.hpp"
#include "ck_tile/ops/reduce.hpp" #include "ck_tile/ops/reduce.hpp"
#define _BLOCK_SOFTMAX_USE_UNPACK2 0
namespace ck_tile { namespace ck_tile {
/* /*
@@ -26,15 +28,23 @@ struct BlockSoftmax2D
CK_TILE_DEVICE void
operator()(const DistributedTensor& x, DistributedTensor& y, number<dim> = {})
{
const auto f_max = [](auto e0, auto e1) { return max(e0, e1); };
const auto f_sum = [](auto e0, auto e1) { return e0 + e1; };
#if _BLOCK_SOFTMAX_USE_UNPACK2
const auto f_max3 = [](auto e0, auto e1, auto e2) {
float rtn;
asm volatile("v_max3_f32 %0, %1, %2, %3" : "=v"(rtn) : "v"(e0), "v"(e1), "v"(e2));
return rtn;};
const auto f_sum3 = [](auto e0, auto e1, auto e2) { return e0 + e1 + e2; };
#endif
// compute row max
-auto row_max =
-    block_tile_reduce<DataType>(x, sequence<dim>{}, f_max, -numeric<DataType>::infinity());
-block_tile_reduce_xor_sync(row_max, f_max);
+auto reduce_row_max = BlockReduce2D{x, -numeric<DataType>::infinity()};
+#if _BLOCK_SOFTMAX_USE_UNPACK2
+auto row_max = reduce_row_max(f_max3, f_max, sequence<1, 2>{});
+#else
+auto row_max = reduce_row_max(f_max);
+#endif
// compute elementwise softmax
constexpr auto span_2d = DistributedTensor::get_distributed_spans();
@@ -47,9 +57,12 @@ struct BlockSoftmax2D
});
// compute row sum
-auto row_sum = block_tile_reduce<DataType>(y, sequence<dim>{}, f_sum, DataType{0});
-block_tile_reduce_xor_sync(row_sum, f_sum);
+auto reduce_row_sum = BlockReduce2D<decltype(y)>{y, DataType{0}};
+#if _BLOCK_SOFTMAX_USE_UNPACK2
+auto row_sum = reduce_row_sum(f_sum3, f_sum, sequence<1, 2>{});
+#else
+auto row_sum = reduce_row_sum(f_sum);
+#endif
// reciprocal
auto r = make_static_distributed_tensor<DataType>(row_sum.get_tile_distribution());
constexpr auto span_1d = decltype(r)::get_distributed_spans();
......
@@ -9,7 +9,7 @@
#include <type_traits>
#ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
-#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 1
+#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 0
#endif
namespace ck_tile {
......
@@ -219,6 +219,5 @@ endif()
add_subdirectory(position_embedding)
add_subdirectory(scatter_gather)
add_subdirectory(topk)
add_subdirectory(topk_softmax)
add_subdirectory(tile_reduce)
add_test_executable(test_topk_softmax topk_softmax.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../example/ck_tile/05_moe/topk_softmax_api.cpp)
target_include_directories(test_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../example/ck_tile/05_moe)
target_compile_options(test_topk_softmax PRIVATE -v --save-temps -Wno-gnu-line-marker)