update first gemm ok

572865a6 · carlushuang · 9ec4e3f7 · 572865a6 · 572865a6 · 572865a6
Commit 572865a6 authored Nov 14, 2024 by carlushuang
8 changed files
--- a/example/ck_tile/15_fused_moe/main.cpp
+++ b/example/ck_tile/15_fused_moe/main.cpp
@@ -207,8 +207,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
        {(max_num_tokens_padded + block_m - 1) / block_m});
    ck_tile::HostTensor<IndexDataType> num_sorted_tiles_host({1});

-#if 1
-#if 1
+#if 0
+#   if 1
    ck_tile::FillStepRange<ADataType>{-.5f, .5f, 0.01f}(a_host);
    ck_tile::FillStepRange<GDataType>{-.5f, .5f, 0.01f}(g_host);
    ck_tile::FillStepRange<DDataType, false>{.5f, -.5f, -0.01f}(d_host);
@@ -217,7 +217,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::FillStepRange<DScaleDataType>{0.f, 1.f, 0.01f}(sd_host);
    ck_tile::FillStepRange<YSmoothScaleDataType>{0.f, 1.f, 0.01f}(sy_host);
    ck_tile::FillStepRange<TopkWeightDataType>{-.5f, .5f, 0.01f}(topk_weight_host);
-#else
+#   else
    ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
    ck_tile::FillUniformDistribution<GDataType>{-.5f, .5f}(g_host);
    ck_tile::FillUniformDistribution<DDataType>{-.5f, .5f}(d_host);
@@ -226,7 +226,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::FillUniformDistribution<DScaleDataType>{-.5f, .5f}(sd_host);
    ck_tile::FillUniformDistribution<YSmoothScaleDataType>{-.5f, .5f}(sy_host);
    ck_tile::FillUniformDistribution<TopkWeightDataType>{-.5f, .5f}(topk_weight_host);
-#endif
+#   endif

    // permute weight
    ck_tile::HostTensor<GDataType> g_perm_host = shuffle_moe_weight(g_host, prec_w, 1);
@@ -266,6 +266,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::HostTensor<DDataType> d_perm_host = shuffle_moe_weight(d_host, prec_w, 1);
    std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl;

+#   if 0
    ck_tile::reference_moe_sorting<TopkWeightDataType, IndexDataType>(
        topk_ids_host,
        topk_weight_host,
@@ -318,8 +319,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
    }

    return 1;
-#endif
+#   endif

+#endif
+    (void)balance;
    ck_tile::reference_moe_sorting<TopkWeightDataType, IndexDataType>(
        topk_ids_host,
        topk_weight_host,

--- a/include/ck_tile/core/tensor/tile_window_linear.hpp
+++ b/include/ck_tile/core/tensor/tile_window_linear.hpp
@@ -432,7 +432,18 @@ struct tile_window_linear
    CK_TILE_DEVICE static constexpr index_t get_bottom_linear_offset(number<i_access>)
    {
        constexpr auto linear_coord = get_bottom_linear_coordinate(number<i_access>{});
-        // since this is linear offset, we assum bottom X tensor is always linear
+        constexpr auto is_pure_linear_tensor =  reduce_on_sequence(LinearBottomDims{}, multiplies{}, number<1>{});
+        if constexpr (is_pure_linear_tensor) {
+            // this case is usually an LDS window, where everything is known at build time.
+            // we directly use BottomTensorView to compute the offset, in case there is any padding
+            auto bottom_tensor_coord = make_tensor_coordinate(
+                BottomTensorView{}.get_tensor_descriptor(), linear_coord);
+            return bottom_tensor_coord.get_offset();
+        } else {
+            // this case is usually a global window, where the last dim can be linear.
+            // as a hack, we use the original TileDstr to compute the linear offset,
+            // ... hoping that there is no extra padding between the other dims, which makes sense
+            // since that would introduce a runtime length
            constexpr index_t linear_offset = [&]() {
                constexpr auto x_idx_ = linear_coord;
                constexpr auto x_len_ = TileDstr{}.get_lengths();
@@ -447,9 +458,9 @@ struct tile_window_linear
                });
                return cu_offset_;
            }();
-
            return linear_offset;
        }
+    }

    CK_TILE_DEVICE constexpr auto get_num_of_access() const { return traits::NumAccess; }


--- a/include/ck_tile/host/reference/reference_fused_moe.hpp
+++ b/include/ck_tile/host/reference/reference_fused_moe.hpp
@@ -122,6 +122,7 @@ void reference_fused_moe(
                       type_convert<AccDataType>(g_host(i_expert, i_n, i_k));
            }
            acc_0(0, i_n) = acc;
+            // printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc);
        }

        ck_tile::HostTensor<AccDataType> y({1, intermediate_size_1});
@@ -134,6 +135,7 @@ void reference_fused_moe(
            for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
            {
                Activation{}(y(0, i_n), acc_0(0, i_n));
+                printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, y(0, i_n));
            }
        }
        else
@@ -168,7 +170,8 @@ void reference_fused_moe(
        }
    };

-    make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency());
+    // make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency());
+    make_ParallelTensorFunctor(f, max_num_tokens_padded)(1);

    // reduce
    auto r = [&](auto i_token) {

--- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
@@ -596,9 +596,9 @@ struct FastGeluAsm
    CK_TILE_DEVICE void operator()<float, float>(float& y, const float& x) const
    {
        // const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
-        const float c1     = 0xbd92220c; // -2.0 * 0.035677f;
+        const uint32_t c1     = 0xbd92220c; // -2.0 * 0.035677f;
        const float c2        = -2.0 * 0.797885f;
-        const float log2e_ = 0x3fb8aa3b; // log2e_v<float>;
+        const uint32_t log2e_ = 0x3fb8aa3b; // log2e_v<float>;
        float tmp;

        asm volatile("v_mul_f32 %[v_tmp], %[v_x], %[v_x]        ; x*x\n"
@@ -606,13 +606,63 @@ struct FastGeluAsm
                     "v_mul_f32 %[v_tmp], %[v_tmp], %[v_x]      ; x*(c1*x*x+c2)\n"
                     "v_mul_f32 %[v_tmp], %[v_tmp], %[s_log2e]  ; log2e*x*(c1*x*x+c2)\n"
                     "v_exp_f32 %[v_tmp], %[v_tmp]              ; emu = exp2(log2e*x*(c1*x*x+c2))\n"
+                     "s_nop 0                                   ; hazard for exp\n"
                     "v_add_f32 %[v_tmp], %[v_tmp], 1.0         ; emu+1.0f\n"
                     "v_rcp_f32 %[v_tmp], %[v_tmp]              ; 1/(emu+1.0f)\n"
+                     "s_nop 0                                   ; hazard for rcp \n"
                     "v_mul_f32 %[v_y], %[v_tmp], %[v_x]        ; x * 1/(emu+1f)\n"
                     : [v_y] "=v"(y), [v_tmp] "+v"(tmp)
                     : [v_x] "v"(x), [s_c1] "s"(c1), [v_c2] "v"(c2), [s_log2e] "s"(log2e_)
                     :);
    }
+
+    template <>
+    CK_TILE_HOST void operator()<fp32x2_t, fp32x2_t>(fp32x2_t& y, const fp32x2_t& x) const
+    {
+        // const float u   = -2.f * x * (0.035677f * x * x + 0.797885f);
+        const float c1  = -2.0 * 0.035677f;
+        const float c2  = -2.0 * 0.797885f;
+        const float u0   = x.x * (c1 * x.x * x.x + c2);
+        const float emu0 = exp(u0);
+        y.x              = x.x / (1.f + emu0);
+        const float u1   = x.y * (c1 * x.y * x.y + c2);
+        const float emu1 = exp(u1);
+        y.y              = x.y / (1.f + emu1);
+    }
+
+    // this is the packed version, to remove the data hazard for trans
+    template <>
+    CK_TILE_DEVICE void operator()<fp32x2_t, fp32x2_t>(fp32x2_t& y, const fp32x2_t& x) const
+    {
+        // const float u   = -2.f * x * (0.035677f * x * x + 0.797885f);
+        const uint32_t c1     = 0xbd92220c; // -2.0 * 0.035677f;
+        const float c2        = -2.0 * 0.797885f;
+        const uint32_t log2e_ = 0x3fb8aa3b; // log2e_v<float>;
+        float tmp0, tmp1;
+        float y0, y1;
+
+        asm volatile("v_mul_f32 %[v_tmp0], %[v_x0], %[v_x0]        ; x*x\n"
+                     "v_mul_f32 %[v_tmp1], %[v_x1], %[v_x1]        ; x*x\n"
+                     "v_fma_f32 %[v_tmp0], %[v_tmp0], %[s_c1], %[v_c2]  ; c1*x*x+c2\n"
+                     "v_fma_f32 %[v_tmp1], %[v_tmp1], %[s_c1], %[v_c2]  ; c1*x*x+c2\n"
+                     "v_mul_f32 %[v_tmp0], %[v_tmp0], %[v_x0]      ; x*(c1*x*x+c2)\n"
+                     "v_mul_f32 %[v_tmp1], %[v_tmp1], %[v_x1]      ; x*(c1*x*x+c2)\n"
+                     "v_mul_f32 %[v_tmp0], %[v_tmp0], %[s_log2e]  ; log2e*x*(c1*x*x+c2)\n"
+                     "v_mul_f32 %[v_tmp1], %[v_tmp1], %[s_log2e]  ; log2e*x*(c1*x*x+c2)\n"
+                     "v_exp_f32 %[v_tmp0], %[v_tmp0]              ; emu = exp2(log2e*x*(c1*x*x+c2))\n"
+                     "v_exp_f32 %[v_tmp1], %[v_tmp1]              ; emu = exp2(log2e*x*(c1*x*x+c2))\n"
+                     "v_add_f32 %[v_tmp0], %[v_tmp0], 1.0         ; emu+1.0f\n"
+                     "v_add_f32 %[v_tmp1], %[v_tmp1], 1.0         ; emu+1.0f\n"
+                     "v_rcp_f32 %[v_tmp0], %[v_tmp0]              ; 1/(emu+1.0f)\n"
+                     "v_rcp_f32 %[v_tmp1], %[v_tmp1]              ; 1/(emu+1.0f)\n"
+                     "v_mul_f32 %[v_y0], %[v_tmp0], %[v_x0]        ; x * 1/(emu+1f)\n"
+                     "v_mul_f32 %[v_y1], %[v_tmp1], %[v_x1]        ; x * 1/(emu+1f)\n"
+                     : [v_y0] "=v"(y0), [v_y1] "=v"(y1), [v_tmp0] "+v"(tmp0), [v_tmp1] "+v"(tmp1)
+                     : [v_x0] "v"(x.x), [v_x1] "v"(x.y), [s_c1] "s"(c1), [v_c2] "v"(c2), [s_log2e] "s"(log2e_)
+                     :);
+        y.x = y0;
+        y.y = y1;
+    }
 };

 // https://paperswithcode.com/method/gelu

--- a/include/ck_tile/ops/flatmm/pipeline/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.hpp
--- a/include/ck_tile/ops/flatmm/pipeline/uk/flatmm_uk_gfx9_32x512x128_1x4x1_16x16x16.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/uk/flatmm_uk_gfx9_32x512x128_1x4x1_16x16x16.hpp
@@ -13,6 +13,26 @@ namespace ck_tile {
 // require 4 wave, occupancy=1c
 // agpr useage:256
 // vgpr usage:64(A local) + 64(acc) + 8(os_a) + 8(os_b) = 144 (rem:112)
+//
+// for this gemm, 4 16x16x16 transposed layout
+//  input A vgpr layout
+//   v0-v15: [ 0:15](gemm_m)x128(gemm_k)
+//  v16-v31: [16:31](gemm_m)x128(gemm_k)
+
+//  input B vgpr layout
+//   v0-v15: [  0: 15](gemm_n)x128(gemm_k)
+//  v16-v31: [ 64: 79](gemm_n)x128(gemm_k)
+//  ......................
+//  v112-v127: [448:463](gemm_n)x128(gemm_k)
+
+//  output C vgpr layout
+//   v0-v3 : [ 0:15](gemm_m)x[ 0: 15](gemm_n)
+//   v4-v7 : [16:31](gemm_m)x[ 0: 15](gemm_n)
+//   v8-v11: [ 0:15](gemm_m)x[64: 79](gemm_n)
+//  v12-v15: [16:31](gemm_m)x[64: 79](gemm_n)
+//  ......................
+//  v56-v59: [ 0:15](gemm_m)x[448:463](gemm_n)
+//  v60-v63: [16:31](gemm_m)x[448:463](gemm_n)
 struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
 {
    static constexpr index_t Block_M = 32;
@@ -42,7 +62,7 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
    static constexpr index_t Repeat_N = Block_N / (Warp_N * WarpPerBlock_N); // 8
    static constexpr index_t Repeat_K = Block_K / (Warp_K * WarpPerBlock_K); // 8/2=4

-    static CK_TILE_DEVICE constexpr auto MakeCBlockTile()
+    static CK_TILE_DEVICE constexpr auto MakeCBlockDist()
    {
        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
            sequence<>,
@@ -53,11 +73,17 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
            sequence<0, 0>>{};

        using WG        = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution;
-        using CDataType = float;

        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+        return c_block_dstr;
+    }
+
+    static CK_TILE_DEVICE constexpr auto MakeCBlockTile()
+    {
+        using CDataType = float;
+        constexpr auto c_block_dstr = MakeCBlockDist();
        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
        return c_block_tensor;
    }
@@ -153,21 +179,8 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
    // template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto MakeLdsLoadDesc_A()
    {
-        // A async->LDS
-        // Note that, this descriptor is only to construct the layout inside LDS
-        // in real Gemm pipeline, ds_read may not follow this pattern
-        // (may follow that in tile_distribution)
-        // below code is almost the same as SmemStore dist, with difference:
-        //  1). modify the GuaranteedLastDimensionVectorLength of naive tensor desc
-        //  2). return discriptor is in NxK 2d layout
-        // constexpr index_t Block_M = Problem::BlockShape::Block_M0;
-        // constexpr index_t Block_K = Problem::BlockShape::Block_K0;
-        // constexpr index_t BlockSize = Problem::BlockShape::BlockSize;
-        constexpr index_t warpSize = ck_tile::get_warp_size();
-        // constexpr index_t NumWarps = Problem::BlockShape::NumWarps;
-
+        // load from LDS to register, every wave has same layout
        constexpr index_t KPack_  = 8;      // GetSmemKPack_A<Problem>(); // LDS
-        constexpr index_t KVector = 2;      // GetAlignment_A<Problem>(); // async copy 1 dword
        constexpr index_t KPad    = KPack_; // pad between warps

        constexpr index_t kAMLane     = 16;
@@ -176,29 +189,12 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
        constexpr index_t kKIter      = 2;
        static_assert(KPack_ == (kABKPerLane * kKIter));

-        static_assert(Block_K % KVector == 0);
-        constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K
-        if constexpr(LanesPerK >= warpSize)
-        {
-            // need multiple waves to load K
-            static_assert(LanesPerK % warpSize == 0);
-            constexpr index_t wavesPerK = LanesPerK / warpSize;
-            if constexpr(wavesPerK >= NumWarps)
-            {
-                // TODO: need multiple issues along K to load all data
-            }
-            else
-            {
-                // TODO: every wave load the same data!
-                static_assert(Block_K % (kABKLane * KPack_) == 0);
-                constexpr index_t issue_along_k = Block_K / (kABKLane * KPack_); // 4
-                constexpr index_t issue_along_m = Block_M / (kAMLane);           // 2
        constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
-                    make_tuple(number<issue_along_m>{},            // m0
-                               number<kAMLane>{},                  // m1
-                               number<issue_along_k>{},            // k0
-                               number<kABKLane>{},                 // k1
-                               number<KPack_>{}),                  // k2
+                    make_tuple(number<Repeat_M>{},            // m0 y
+                               number<kAMLane>{},             // m1 p
+                               number<Repeat_K>{},            // k0 y
+                               number<kABKLane>{},            // k1 p
+                               number<KPack_>{}),             // k2 y-vector
                    make_tuple(number<kAMLane*(Block_K + KPad)>{}, // m0
                               number<Block_K + KPad>{},           // m1
                               number<kABKLane * KPack_>{},        // k0
@@ -210,19 +206,14 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
        constexpr auto lds_desc_m_k = transform_tensor_descriptor(
            lds_block_desc_0,
            make_tuple(make_merge_transform(
-                                   make_tuple(number<issue_along_m>{}, number<kAMLane>{})),
+                            make_tuple(number<Repeat_M>{}, number<kAMLane>{})),
                        make_merge_transform(make_tuple(
-                                   number<issue_along_k>{}, number<kABKLane>{}, number<KPack_>{}))),
+                            number<Repeat_K>{}, number<kABKLane>{}, number<KPack_>{}))),
            make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}),
            make_tuple(sequence<0>{}, sequence<1>{}));

        return lds_desc_m_k;
    }
-        }
-        else
-        {
-        }
-    }

    static constexpr auto GetGemm_AWarpEnc()
    {
@@ -271,10 +262,10 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
        auto a_sld = [&]() {
            constexpr auto a_warp_enc_      = GetGemm_AWarpEnc();
            constexpr auto a_outer_dstr_enc = tile_distribution_encoding<
-                sequence<>,
+                sequence<WarpPerBlock_N>,
                tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_K>>,
-                tuple<sequence<1>>,
-                tuple<sequence<1>>,
+                tuple<sequence<1, 0>>,
+                tuple<sequence<1, 0>>,
                sequence<1, 2>,
                sequence<0, 0>>{};
            constexpr auto a_block_dstr_encode =
@@ -300,6 +291,12 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
            },
            number<a_sld.get_num_of_access()>{});

+
+        printf("----- tid:%d, a_sld:%d\n", static_cast<index_t>(threadIdx.x),
+                        static_cast<index_t>(a_sld.cached_coords_[number<0>{}].get_offset()));
+
+
+
        index_t loop_cnt = k / Block_K;

        // this is the acc thread buffer

--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
@@ -586,6 +586,47 @@ struct FusedMoeGemmPipelineFlatmmPolicy
        return desc;
    }

+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBridgeLdsStoreForUKDesc()
+    {
+        constexpr index_t WarpPerBlock_N = Problem::BlockShape::WarpPerBlock_N0;
+        constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N0;
+        constexpr index_t Repeat_M = Problem::BlockShape::Repeat_M0;
+
+        constexpr index_t kAMLane     = 16;
+        constexpr index_t kABKLane    = 4;
+        constexpr index_t kABKPerLane = 4;
+
+        constexpr index_t KPack       = kABKPerLane;
+
+        constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
+                    make_tuple(number<Repeat_M>{},                  // m
+                               number<Repeat_N>{},                  // n
+                               number<WarpPerBlock_N>{},            // n
+                               number<kABKLane>{},                  // n
+                               number<kAMLane>{},                   // m
+                               number<KPack>{}),                    // n
+                    make_tuple(number<Repeat_N * WarpPerBlock_N * kABKLane * kAMLane * KPack>{},  //  m
+                               number<WarpPerBlock_N * kABKLane * kAMLane * KPack>{},   //  n
+                               number<kABKLane * kAMLane * KPack>{},                    //  n
+                               number<kAMLane * KPack>{},                               //  n
+                               number<KPack>{},                                         //  m
+                               number<1>{}),                                            //  n
+                    number<KPack>{},                // lds store vector(actually no explicit store)
+                    number<1>{});
+
+        constexpr auto desc = transform_tensor_descriptor(
+                lds_block_desc_0,
+                make_tuple(
+                    make_merge_transform(make_tuple(number<Repeat_M>{}, number<kAMLane>{})),
+                    make_merge_transform(make_tuple(number<Repeat_N>{}, number<WarpPerBlock_N>{}, number<kABKLane>{}, number<KPack>{}))
+                ),
+                make_tuple(sequence<0, 4>{}, sequence<1, 2, 3, 5>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return desc;
+    }
+
    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemm0()
    {

--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
@@ -340,12 +340,15 @@ struct FusedMoeGemmPipeline_FlatmmUk
                           number<row_ids_a.size()>{});

        auto bridge_sst_win = [&]() {
-            return make_tile_window(
+            constexpr auto desc_ = Policy::template MakeBridgeLdsStoreForUKDesc<Problem>();
+            constexpr auto dist_ = Policy::template GetUK_0<Problem>().MakeCBlockDist();
+            return make_tile_window_linear(
                make_tensor_view<address_space_enum::lds>(
                    reinterpret_cast<YDataType*>(smem),
-                    Policy::template MakeBridgeLdsStoreDesc<Problem>()),
-                Policy::template MakeBridgeLdsStoreDesc<Problem>().get_lengths(),
-                {0, 0});
+                    desc_),
+                desc_.get_lengths(),
+                {0, 0},
+                dist_);
        }();
        auto o_res =
            make_wave_buffer_resource(reinterpret_cast<const ODataType*>(kargs.o_ptr),
@@ -439,8 +442,56 @@ struct FusedMoeGemmPipeline_FlatmmUk
                              BlockShape::Block_W0); // tile offset for B matrix each unroll

        // return ;
+        //sweep_tile(acc_0,
+        //           [&](auto idx) { typename Problem::GateActivation{}(acc_0(idx), acc_0[idx]); });
        sweep_tile(acc_0,
-                   [&](auto idx) { typename Problem::GateActivation{}(acc_0(idx), acc_0[idx]); });
+                   [&](auto idx0, auto idx1) {
+                        fp32x2_t v_ {acc_0(idx0), acc_0(idx1)};
+                        typename Problem::GateActivation{}(v_, v_);
+                        acc_0(idx0) = v_.x;
+                        acc_0(idx1) = v_.y;
+                    },
+                    sequence<1, 2>{});
+
+#if 0
+        printf("bid:%d,%d, tid:%d, sorted_tile_id:%d(, intermediate_tile_id:%d, e:%d, "
+               "interm_idx_nr:%d, coords:a:%d,%d,%d, row_ids_a:%d,%d,%d, (%d)g_coords:%d.%d.%d, bridge_sst_win:%d"
+               "acc:%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f\n",
+               static_cast<int>(blockIdx.x),
+               static_cast<int>(blockIdx.y),
+               static_cast<int>(threadIdx.x),
+               sorted_tile_id,
+               intermediate_tile_id,
+               expert_id,
+               interm_idx_nr,
+               row_coords_a[0],
+               row_coords_a[1],
+               row_coords_a[7],
+               row_ids_a[0],
+               row_ids_a[1],
+               row_ids_a[7],
+               kr_0 * BlockShape::Block_W0,
+               g_coords[number<0>{}],
+               g_coords[number<1>{}],
+               g_coords[number<7>{}],
+               bridge_sst_win.cached_coords_[number<0>{}].get_offset(),
+                acc_0.get_thread_buffer()[number<0>{}],
+                acc_0.get_thread_buffer()[number<1>{}],
+                acc_0.get_thread_buffer()[number<2>{}],
+                acc_0.get_thread_buffer()[number<3>{}],
+                acc_0.get_thread_buffer()[number<4>{}],
+                acc_0.get_thread_buffer()[number<5>{}],
+                acc_0.get_thread_buffer()[number<6>{}],
+                acc_0.get_thread_buffer()[number<7>{}],
+                acc_0.get_thread_buffer()[number<8 + 0>{}],
+                acc_0.get_thread_buffer()[number<8 + 1>{}],
+                acc_0.get_thread_buffer()[number<8 + 2>{}],
+                acc_0.get_thread_buffer()[number<8 + 3>{}],
+                acc_0.get_thread_buffer()[number<8 + 4>{}],
+                acc_0.get_thread_buffer()[number<8 + 5>{}],
+                acc_0.get_thread_buffer()[number<8 + 6>{}],
+                acc_0.get_thread_buffer()[number<8 + 7>{}]);
+#endif

        auto y_pre = cast_tile<YDataType>(acc_0);
        store_tile(bridge_sst_win, y_pre);