update first gemm ok

572865a6 · carlushuang · 9ec4e3f7 · 572865a6 · 572865a6 · 572865a6
Commit 572865a6 authored Nov 14, 2024 by carlushuang
8 changed files
--- a/example/ck_tile/15_fused_moe/main.cpp
+++ b/example/ck_tile/15_fused_moe/main.cpp
@@ -207,8 +207,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
        {(max_num_tokens_padded + block_m - 1) / block_m});
    ck_tile::HostTensor<IndexDataType> num_sorted_tiles_host({1});

-#if 1
-#if 1
+#if 0
+#   if 1
    ck_tile::FillStepRange<ADataType>{-.5f, .5f, 0.01f}(a_host);
    ck_tile::FillStepRange<GDataType>{-.5f, .5f, 0.01f}(g_host);
    ck_tile::FillStepRange<DDataType, false>{.5f, -.5f, -0.01f}(d_host);
@@ -217,7 +217,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::FillStepRange<DScaleDataType>{0.f, 1.f, 0.01f}(sd_host);
    ck_tile::FillStepRange<YSmoothScaleDataType>{0.f, 1.f, 0.01f}(sy_host);
    ck_tile::FillStepRange<TopkWeightDataType>{-.5f, .5f, 0.01f}(topk_weight_host);
-#else
+#   else
    ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
    ck_tile::FillUniformDistribution<GDataType>{-.5f, .5f}(g_host);
    ck_tile::FillUniformDistribution<DDataType>{-.5f, .5f}(d_host);
@@ -226,7 +226,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::FillUniformDistribution<DScaleDataType>{-.5f, .5f}(sd_host);
    ck_tile::FillUniformDistribution<YSmoothScaleDataType>{-.5f, .5f}(sy_host);
    ck_tile::FillUniformDistribution<TopkWeightDataType>{-.5f, .5f}(topk_weight_host);
-#endif
+#   endif

    // permute weight
    ck_tile::HostTensor<GDataType> g_perm_host = shuffle_moe_weight(g_host, prec_w, 1);
@@ -266,6 +266,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::HostTensor<DDataType> d_perm_host = shuffle_moe_weight(d_host, prec_w, 1);
    std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl;

+#   if 0
    ck_tile::reference_moe_sorting<TopkWeightDataType, IndexDataType>(
        topk_ids_host,
        topk_weight_host,
@@ -318,8 +319,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
    }

    return 1;
-#endif
+#   endif

+#endif
+    (void)balance;
    ck_tile::reference_moe_sorting<TopkWeightDataType, IndexDataType>(
        topk_ids_host,
        topk_weight_host,

--- a/include/ck_tile/core/tensor/tile_window_linear.hpp
+++ b/include/ck_tile/core/tensor/tile_window_linear.hpp
@@ -432,23 +432,34 @@ struct tile_window_linear
    CK_TILE_DEVICE static constexpr index_t get_bottom_linear_offset(number<i_access>)
    {
        constexpr auto linear_coord = get_bottom_linear_coordinate(number<i_access>{});
-        // since this is linear offset, we assum bottom X tensor is always linear
-        constexpr index_t linear_offset = [&]() {
-            constexpr auto x_idx_ = linear_coord;
-            constexpr auto x_len_ = TileDstr{}.get_lengths();
-            static_assert(x_idx_.size() == x_len_.size());
-            constexpr index_t x_dims_ = x_idx_.size();
-            index_t cu_stride_        = 1;
-            index_t cu_offset_        = 0;
-            static_for<0, x_dims_, 1>{}([&](auto i_) {
-                auto r_i_ = number<x_dims_ - i_ - 1>{};
-                cu_offset_ += x_idx_[r_i_] * cu_stride_;
-                cu_stride_ *= x_len_[r_i_];
-            });
-            return cu_offset_;
-        }();
-
-        return linear_offset;
+        constexpr auto is_pure_linear_tensor =  reduce_on_sequence(LinearBottomDims{}, multiplies{}, number<1>{});
+        if constexpr (is_pure_linear_tensor) {
+            // this case usually is a LDS window, everything is build time know.
+            // we directly use BottomTensorView to compute the offset, in case there is any padding
+            auto bottom_tensor_coord = make_tensor_coordinate(
+                BottomTensorView{}.get_tensor_descriptor(), linear_coord);
+            return bottom_tensor_coord.get_offset();
+        } else {
+            // this case usually is a global window, where last dim can be linear
+            // we hack here, that use the original TileDstr to compute the linear offset
+            // ... hoping that there is no extra padding between other dims, which make sense
+            // since that sould introduce runtime length
+            constexpr index_t linear_offset = [&]() {
+                constexpr auto x_idx_ = linear_coord;
+                constexpr auto x_len_ = TileDstr{}.get_lengths();
+                static_assert(x_idx_.size() == x_len_.size());
+                constexpr index_t x_dims_ = x_idx_.size();
+                index_t cu_stride_        = 1;
+                index_t cu_offset_        = 0;
+                static_for<0, x_dims_, 1>{}([&](auto i_) {
+                    auto r_i_ = number<x_dims_ - i_ - 1>{};
+                    cu_offset_ += x_idx_[r_i_] * cu_stride_;
+                    cu_stride_ *= x_len_[r_i_];
+                });
+                return cu_offset_;
+            }();
+            return linear_offset;
+        }
    }

    CK_TILE_DEVICE constexpr auto get_num_of_access() const { return traits::NumAccess; }

--- a/include/ck_tile/host/reference/reference_fused_moe.hpp
+++ b/include/ck_tile/host/reference/reference_fused_moe.hpp
@@ -122,6 +122,7 @@ void reference_fused_moe(
                       type_convert<AccDataType>(g_host(i_expert, i_n, i_k));
            }
            acc_0(0, i_n) = acc;
+            // printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc);
        }

        ck_tile::HostTensor<AccDataType> y({1, intermediate_size_1});
@@ -134,6 +135,7 @@ void reference_fused_moe(
            for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
            {
                Activation{}(y(0, i_n), acc_0(0, i_n));
+                printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, y(0, i_n));
            }
        }
        else
@@ -168,7 +170,8 @@ void reference_fused_moe(
        }
    };

-    make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency());
+    // make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency());
+    make_ParallelTensorFunctor(f, max_num_tokens_padded)(1);

    // reduce
    auto r = [&](auto i_token) {

--- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
@@ -596,9 +596,9 @@ struct FastGeluAsm
    CK_TILE_DEVICE void operator()<float, float>(float& y, const float& x) const
    {
        // const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
-        const float c1     = 0xbd92220c; // -2.0 * 0.035677f;
-        const float c2     = -2.0 * 0.797885f;
-        const float log2e_ = 0x3fb8aa3b; // log2e_v<float>;
+        const uint32_t c1     = 0xbd92220c; // -2.0 * 0.035677f;
+        const float c2        = -2.0 * 0.797885f;
+        const uint32_t log2e_ = 0x3fb8aa3b; // log2e_v<float>;
        float tmp;

        asm volatile("v_mul_f32 %[v_tmp], %[v_x], %[v_x]        ; x*x\n"
@@ -606,13 +606,63 @@ struct FastGeluAsm
                     "v_mul_f32 %[v_tmp], %[v_tmp], %[v_x]      ; x*(c1*x*x+c2)\n"
                     "v_mul_f32 %[v_tmp], %[v_tmp], %[s_log2e]  ; log2e*x*(c1*x*x+c2)\n"
                     "v_exp_f32 %[v_tmp], %[v_tmp]              ; emu = exp2(log2e*x*(c1*x*x+c2))\n"
+                     "s_nop 0                                   ; hazard for exp\n"
                     "v_add_f32 %[v_tmp], %[v_tmp], 1.0         ; emu+1.0f\n"
                     "v_rcp_f32 %[v_tmp], %[v_tmp]              ; 1/(emu+1.0f)\n"
+                     "s_nop 0                                   ; hazard for rcp \n"
                     "v_mul_f32 %[v_y], %[v_tmp], %[v_x]        ; x * 1/(emu+1f)\n"
                     : [v_y] "=v"(y), [v_tmp] "+v"(tmp)
                     : [v_x] "v"(x), [s_c1] "s"(c1), [v_c2] "v"(c2), [s_log2e] "s"(log2e_)
                     :);
    }
+
+    template <>
+    CK_TILE_HOST void operator()<fp32x2_t, fp32x2_t>(fp32x2_t& y, const fp32x2_t& x) const
+    {
+        // const float u   = -2.f * x * (0.035677f * x * x + 0.797885f);
+        const float c1  = -2.0 * 0.035677f;
+        const float c2  = -2.0 * 0.797885f;
+        const float u0   = x.x * (c1 * x.x * x.x + c2);
+        const float emu0 = exp(u0);
+        y.x              = x.x / (1.f + emu0);
+        const float u1   = x.y * (c1 * x.y * x.y + c2);
+        const float emu1 = exp(u1);
+        y.y              = x.y / (1.f + emu1);
+    }
+
+    // this is packed verion to remove data hazard for trans
+    template <>
+    CK_TILE_DEVICE void operator()<fp32x2_t, fp32x2_t>(fp32x2_t& y, const fp32x2_t& x) const
+    {
+        // const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
+        const uint32_t c1     = 0xbd92220c; // -2.0 * 0.035677f;
+        const float c2        = -2.0 * 0.797885f;
+        const uint32_t log2e_ = 0x3fb8aa3b; // log2e_v<float>;
+        float tmp0, tmp1;
+        float y0, y1;
+
+        asm volatile("v_mul_f32 %[v_tmp0], %[v_x0], %[v_x0]        ; x*x\n"
+                     "v_mul_f32 %[v_tmp1], %[v_x1], %[v_x1]        ; x*x\n"
+                     "v_fma_f32 %[v_tmp0], %[v_tmp0], %[s_c1], %[v_c2]  ; c1*x*x+c2\n"
+                     "v_fma_f32 %[v_tmp1], %[v_tmp1], %[s_c1], %[v_c2]  ; c1*x*x+c2\n"
+                     "v_mul_f32 %[v_tmp0], %[v_tmp0], %[v_x0]      ; x*(c1*x*x+c2)\n"
+                     "v_mul_f32 %[v_tmp1], %[v_tmp1], %[v_x1]      ; x*(c1*x*x+c2)\n"
+                     "v_mul_f32 %[v_tmp0], %[v_tmp0], %[s_log2e]  ; log2e*x*(c1*x*x+c2)\n"
+                     "v_mul_f32 %[v_tmp1], %[v_tmp1], %[s_log2e]  ; log2e*x*(c1*x*x+c2)\n"
+                     "v_exp_f32 %[v_tmp0], %[v_tmp0]              ; emu = exp2(log2e*x*(c1*x*x+c2))\n"
+                     "v_exp_f32 %[v_tmp1], %[v_tmp1]              ; emu = exp2(log2e*x*(c1*x*x+c2))\n"
+                     "v_add_f32 %[v_tmp0], %[v_tmp0], 1.0         ; emu+1.0f\n"
+                     "v_add_f32 %[v_tmp1], %[v_tmp1], 1.0         ; emu+1.0f\n"
+                     "v_rcp_f32 %[v_tmp0], %[v_tmp0]              ; 1/(emu+1.0f)\n"
+                     "v_rcp_f32 %[v_tmp1], %[v_tmp1]              ; 1/(emu+1.0f)\n"
+                     "v_mul_f32 %[v_y0], %[v_tmp0], %[v_x0]        ; x * 1/(emu+1f)\n"
+                     "v_mul_f32 %[v_y1], %[v_tmp1], %[v_x1]        ; x * 1/(emu+1f)\n"
+                     : [v_y0] "=v"(y0), [v_y1] "=v"(y1), [v_tmp0] "+v"(tmp0), [v_tmp1] "+v"(tmp1)
+                     : [v_x0] "v"(x.x), [v_x1] "v"(x.y), [s_c1] "s"(c1), [v_c2] "v"(c2), [s_log2e] "s"(log2e_)
+                     :);
+        y.x = y0;
+        y.y = y1;
+    }
 };

 // https://paperswithcode.com/method/gelu

--- a/include/ck_tile/ops/flatmm/pipeline/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.hpp
@@ -262,168 +262,168 @@ struct FlatmmSnUK_GFX9_32x128x512_1x4x1_16x16x16_BF16
            "  s_waitcnt     vmcnt(32)                               \n"
            "  s_barrier                                             \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  buffer_load_dwordx4  acc[128:131], %[v_os_b0], s[12:15], 0 offen \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  buffer_load_dwordx4  acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  buffer_load_dwordx4  acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  buffer_load_dwordx4  acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], [%[c4], %[c5], %[c6], %[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[16:17], v[128:129], 0 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[18:19], v[130:131], [%[c8],%[c9],%[c10],%[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n"
            "  buffer_load_dwordx4  acc[144:147], %[v_os_b1], s[12:15], 0 offen \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[20:21], v[132:133], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[22:23], v[134:135], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[24:25], v[136:137], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[26:27], v[138:139], [%[c8],%[c9],%[c10],%[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]] \n"
            "  buffer_load_dwordx4  acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[28:29], v[140:141], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[30:31], v[142:143], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[16:17], v[192:193], 0 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[18:19], v[194:195], [%[c12],%[c13],%[c14],%[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n"
            "  buffer_load_dwordx4  acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[20:21], v[196:197], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[22:23], v[198:199], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[24:25], v[200:201], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[26:27], v[202:203], [%[c12],%[c13],%[c14],%[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], %[c14], %[c15]] \n"
            "  buffer_load_dwordx4  acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[28:29], v[204:205], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[30:31], v[206:207], [%[c12],%[c13],%[c14],%[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], [%[c12], %[c13], %[c14], %[c15]] \n"
            "  s_waitcnt     vmcnt(32)                               \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0],%[c1],%[c2],%[c3]], acc[32:33], v[144:145], [%[c0],%[c1],%[c2],%[c3]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0],%[c1],%[c2],%[c3]], acc[34:35], v[146:147], [%[c0],%[c1],%[c2],%[c3]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  buffer_load_dwordx4  acc[160:163], %[v_os_b2], s[12:15], 0 offen \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0],%[c1],%[c2],%[c3]], acc[36:37], v[148:149], [%[c0],%[c1],%[c2],%[c3]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0],%[c1],%[c2],%[c3]], acc[38:39], v[150:151], [%[c0],%[c1],%[c2],%[c3]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0],%[c1],%[c2],%[c3]], acc[40:41], v[152:153], [%[c0],%[c1],%[c2],%[c3]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0],%[c1],%[c2],%[c3]], acc[42:43], v[154:155], [%[c0],%[c1],%[c2],%[c3]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], %[c3]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], [%[c0], %[c1], %[c2], %[c3]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  buffer_load_dwordx4  acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0],%[c1],%[c2],%[c3]], acc[44:45], v[156:157], [%[c0],%[c1],%[c2],%[c3]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0],%[c1],%[c2],%[c3]], acc[46:47], v[158:159], [%[c0],%[c1],%[c2],%[c3]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4],%[c5],%[c6],%[c7]], acc[32:33], v[208:209], [%[c4],%[c5],%[c6],%[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4],%[c5],%[c6],%[c7]], acc[34:35], v[210:211], [%[c4],%[c5],%[c6],%[c7]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], %[c3]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], [%[c0], %[c1], %[c2], %[c3]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  buffer_load_dwordx4  acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4],%[c5],%[c6],%[c7]], acc[36:37], v[212:213], [%[c4],%[c5],%[c6],%[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4],%[c5],%[c6],%[c7]], acc[38:39], v[214:215], [%[c4],%[c5],%[c6],%[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4],%[c5],%[c6],%[c7]], acc[40:41], v[216:217], [%[c4],%[c5],%[c6],%[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4],%[c5],%[c6],%[c7]], acc[42:43], v[218:219], [%[c4],%[c5],%[c6],%[c7]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], %[c7]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], [%[c4], %[c5], %[c6], %[c7]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  buffer_load_dwordx4  acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4],%[c5],%[c6],%[c7]], acc[44:45], v[220:221], [%[c4],%[c5],%[c6],%[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4],%[c5],%[c6],%[c7]], acc[46:47], v[222:223], [%[c4],%[c5],%[c6],%[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[48:49], v[144:145], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[50:51], v[146:147], [%[c8],%[c9],%[c10],%[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], %[c7]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], [%[c4], %[c5], %[c6], %[c7]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]] \n"
            "  buffer_load_dwordx4  acc[176:179], %[v_os_b3], s[12:15], 0 offen \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[52:53], v[148:149], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[54:55], v[150:151], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[56:57], v[152:153], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[58:59], v[154:155], [%[c8],%[c9],%[c10],%[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]] \n"
            "  buffer_load_dwordx4  acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[60:61], v[156:157], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[62:63], v[158:159], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[48:49], v[208:209], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[50:51], v[210:211], [%[c12],%[c13],%[c14],%[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], %[c14], %[c15]] \n"
            "  buffer_load_dwordx4  acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[52:53], v[212:213], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[54:55], v[214:215], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[56:57], v[216:217], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[58:59], v[218:219], [%[c12],%[c13],%[c14],%[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], %[c14], %[c15]] \n"
            "  buffer_load_dwordx4  acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[60:61], v[220:221], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[62:63], v[222:223], [%[c12],%[c13],%[c14],%[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n"
            "  s_waitcnt     vmcnt(32)                               \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  buffer_load_dwordx4  acc[192:195], %[v_os_b4], s[12:15], 0 offen \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  buffer_load_dwordx4  acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  buffer_load_dwordx4  acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  buffer_load_dwordx4  acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], [%[c4], %[c5], %[c6], %[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[80:81], v[160:161], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[82:83], v[162:163], [%[c8],%[c9],%[c10],%[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]] \n"
            "  buffer_load_dwordx4  acc[208:211], %[v_os_b5], s[12:15], 0 offen \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[84:85], v[164:165], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[86:87], v[166:167], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[88:89], v[168:169], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[90:91], v[170:171], [%[c8],%[c9],%[c10],%[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]] \n"
            "  buffer_load_dwordx4  acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[92:93], v[172:173], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[94:95], v[174:175], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[80:81], v[224:225], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[82:83], v[226:227], [%[c12],%[c13],%[c14],%[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], %[c14], %[c15]] \n"
            "  buffer_load_dwordx4  acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[84:85], v[228:229], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[86:87], v[230:231], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[88:89], v[232:233], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[90:91], v[234:235], [%[c12],%[c13],%[c14],%[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], %[c14], %[c15]] \n"
            "  buffer_load_dwordx4  acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[92:93], v[236:237], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[94:95], v[238:239], [%[c12],%[c13],%[c14],%[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  s_waitcnt     vmcnt(32)                               \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  buffer_load_dwordx4  acc[224:227], %[v_os_b6], s[12:15], 0 offen \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  buffer_load_dwordx4  acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  buffer_load_dwordx4  acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  buffer_load_dwordx4  acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], %[c6], %[c7]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[112:113], v[176:177], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[114:115], v[178:179], [%[c8],%[c9],%[c10],%[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]] \n"
            "  buffer_load_dwordx4  acc[240:243], %[v_os_b7], s[12:15], 0 offen \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[116:117], v[180:181], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[118:119], v[182:183], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[120:121], v[184:185], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[122:123], v[186:187], [%[c8],%[c9],%[c10],%[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]] \n"
            "  buffer_load_dwordx4  acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[124:125], v[188:189], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c8],%[c9],%[c10],%[c11]], acc[126:127], v[190:191], [%[c8],%[c9],%[c10],%[c11]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[112:113], v[240:241], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[114:115], v[242:243], [%[c12],%[c13],%[c14],%[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], %[c14], %[c15]] \n"
            "  buffer_load_dwordx4  acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[116:117], v[244:245], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[118:119], v[246:247], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[120:121], v[248:249], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[122:123], v[250:251], [%[c12],%[c13],%[c14],%[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], %[c13], %[c14], %[c15]] \n"
            "  buffer_load_dwordx4  acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[124:125], v[252:253], [%[c12],%[c13],%[c14],%[c15]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c12],%[c13],%[c14],%[c15]], acc[126:127], v[254:255], [%[c12],%[c13],%[c14],%[c15]] \n"
-            
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], %[c13], %[c14], %[c15]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], %[c15]]\n"
            // "  s_add_u32     s60, 0x00000100, s80                    \n"
            // "  s_cmp_lt_u32  s60, s81                                \n"
            // "  s_cselect_b32  s56, s56, 0                            \n"
@@ -561,167 +561,168 @@ struct FlatmmSnUK_GFX9_32x128x512_1x4x1_16x16x16_BF16
            "  s_waitcnt     vmcnt(32)                               \n"
            "  s_barrier                                             \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], [%[c16],%[c17],%[c18],%[c19]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], [%[c20],%[c21],%[c22],%[c23]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], [%[c24],%[c25],%[c26],%[c27]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0 \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], [%[c28],%[c29],%[c30],%[c31]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  s_waitcnt     vmcnt(32)                               \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], [%[c16],%[c17],%[c18],%[c19]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], [%[c16],%[c17],%[c18],%[c19]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], [%[c20],%[c21],%[c22],%[c23]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], [%[c20],%[c21],%[c22],%[c23]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], [%[c24],%[c25],%[c26],%[c27]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], [%[c24],%[c25],%[c26],%[c27]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], [%[c28],%[c29],%[c30],%[c31]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], [%[c28],%[c29],%[c30],%[c31]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  s_waitcnt     vmcnt(32)                               \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], [%[c16],%[c17],%[c18],%[c19]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], [%[c16],%[c17],%[c18],%[c19]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], [%[c20],%[c21],%[c22],%[c23]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], [%[c20],%[c21],%[c22],%[c23]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], [%[c24],%[c25],%[c26],%[c27]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], [%[c24],%[c25],%[c26],%[c27]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], [%[c28],%[c29],%[c30],%[c31]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], [%[c28],%[c29],%[c30],%[c31]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], [%[c28],%[c29],%[c30],%[c31]] \n"
+            "  s_waitcnt     vmcnt(32)                               \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], [%[c16],%[c17],%[c18],%[c19]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], [%[c16],%[c17],%[c18],%[c19]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], [%[c16],%[c17],%[c18],%[c19]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], [%[c20],%[c21],%[c22],%[c23]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], [%[c20],%[c21],%[c22],%[c23]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], [%[c20],%[c21],%[c22],%[c23]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], [%[c24],%[c25],%[c26],%[c27]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], [%[c24],%[c25],%[c26],%[c27]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], [%[c24],%[c25],%[c26],%[c27]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], [%[c28],%[c29],%[c30],%[c31]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], [%[c28],%[c29],%[c30],%[c31]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]] \n"
            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], [%[c28],%[c29],%[c30],%[c31]] \n"
-            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]] \n"
+            "  v_mfma_f32_16x16x16_bf16  [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]]\n"
            // "  s_add_u32     s60, 0x00000100, s80                    \n"
            // "  s_cmp_lt_u32  s60, s81                                \n"
            // "  s_cselect_b32  s56, s56, 0                            \n"

--- a/include/ck_tile/ops/flatmm/pipeline/uk/flatmm_uk_gfx9_32x512x128_1x4x1_16x16x16.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/uk/flatmm_uk_gfx9_32x512x128_1x4x1_16x16x16.hpp
@@ -13,6 +13,26 @@ namespace ck_tile {
 // require 4 wave, occupancy=1c
 // agpr useage:256
 // vgpr usage:64(A local) + 64(acc) + 8(os_a) + 8(os_b) = 144 (rem:112)
+//
+// for this gemm, 4 16x16x16 transposed layout
+//  input A vpgpr layout
+//   v0-v15: [ 0:15](gemm_m)x128(gemm_k)
+//  v16-v31: [16:31](gemm_m)x128(gemm_k)
+
+//  input B vpgpr layout
+//   v0-v15: [  0: 15](gemm_n)x128(gemm_k)
+//  v16-v31: [ 64: 79](gemm_n)x128(gemm_k)
+//  ......................
+//  v111-v127: [448:463](gemm_n)x128(gemm_k)
+
+//  output C vpgpr layout
+//   v0-v3 : [ 0:15](gemm_m)x[ 0: 15](gemm_n)
+//   v4-v7 : [16:31](gemm_m)x[ 0: 15](gemm_n)
+//   v8-v11: [ 0:15](gemm_m)x[64: 79](gemm_n)
+//  v12-v15: [16:31](gemm_m)x[64: 79](gemm_n)
+//  ......................
+//  v56-v59: [ 0:15](gemm_m)x[448:463](gemm_n)
+//  v60-v63: [16:31](gemm_m)x[448:463](gemm_n)
 struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
 {
    static constexpr index_t Block_M = 32;
@@ -42,7 +62,7 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
    static constexpr index_t Repeat_N = Block_N / (Warp_N * WarpPerBlock_N); // 8
    static constexpr index_t Repeat_K = Block_K / (Warp_K * WarpPerBlock_K); // 8/2=4

-    static CK_TILE_DEVICE constexpr auto MakeCBlockTile()
+    static CK_TILE_DEVICE constexpr auto MakeCBlockDist()
    {
        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
            sequence<>,
@@ -53,11 +73,17 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
            sequence<0, 0>>{};

        using WG        = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution;
-        using CDataType = float;

        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+        return c_block_dstr;
+    }
+
+    static CK_TILE_DEVICE constexpr auto MakeCBlockTile()
+    {
+        using CDataType = float;
+        constexpr auto c_block_dstr = MakeCBlockDist();
        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
        return c_block_tensor;
    }
@@ -153,21 +179,8 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
    // template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto MakeLdsLoadDesc_A()
    {
-        // A async->LDS
-        // Note that, this descriptor is only to construct the layout inside LDS
-        // in real Gemm pipeline, ds_read may not follow this pattern
-        // (may follow that in tile_distribution)
-        // below code is almost the same as SmemStore dist, with difference:
-        //  1). modify the GuaranteedLastDimensionVectorLength of naive tensor desc
-        //  2). return discriptor is in NxK 2d layout
-        // constexpr index_t Block_M = Problem::BlockShape::Block_M0;
-        // constexpr index_t Block_K = Problem::BlockShape::Block_K0;
-        // constexpr index_t BlockSize = Problem::BlockShape::BlockSize;
-        constexpr index_t warpSize = ck_tile::get_warp_size();
-        // constexpr index_t NumWarps = Problem::BlockShape::NumWarps;
-
+        // load from LDS to register, every wave has same layout
        constexpr index_t KPack_  = 8;      // GetSmemKPack_A<Problem>(); // LDS
-        constexpr index_t KVector = 2;      // GetAlignment_A<Problem>(); // async copy 1 dword
        constexpr index_t KPad    = KPack_; // pad between warps

        constexpr index_t kAMLane     = 16;
@@ -176,29 +189,12 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
        constexpr index_t kKIter      = 2;
        static_assert(KPack_ == (kABKPerLane * kKIter));

-        static_assert(Block_K % KVector == 0);
-        constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K
-        if constexpr(LanesPerK >= warpSize)
-        {
-            // need multiple waves to load K
-            static_assert(LanesPerK % warpSize == 0);
-            constexpr index_t wavesPerK = LanesPerK / warpSize;
-            if constexpr(wavesPerK >= NumWarps)
-            {
-                // TODO: need multiple issues along K to load all data
-            }
-            else
-            {
-                // TODO: every wave load the same data!
-                static_assert(Block_K % (kABKLane * KPack_) == 0);
-                constexpr index_t issue_along_k = Block_K / (kABKLane * KPack_); // 4
-                constexpr index_t issue_along_m = Block_M / (kAMLane);           // 2
-                constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
-                    make_tuple(number<issue_along_m>{},            // m0
-                               number<kAMLane>{},                  // m1
-                               number<issue_along_k>{},            // k0
-                               number<kABKLane>{},                 // k1
-                               number<KPack_>{}),                  // k2
+        constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
+                    make_tuple(number<Repeat_M>{},            // m0 y
+                               number<kAMLane>{},             // m1 p
+                               number<Repeat_K>{},            // k0 y
+                               number<kABKLane>{},            // k1 p
+                               number<KPack_>{}),             // k2 y-vector
                    make_tuple(number<kAMLane*(Block_K + KPad)>{}, // m0
                               number<Block_K + KPad>{},           // m1
                               number<kABKLane * KPack_>{},        // k0
@@ -207,21 +203,16 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
                    number<KPack_>{},                              // lds load vector
                    number<1>{});

-                constexpr auto lds_desc_m_k = transform_tensor_descriptor(
-                    lds_block_desc_0,
-                    make_tuple(make_merge_transform(
-                                   make_tuple(number<issue_along_m>{}, number<kAMLane>{})),
-                               make_merge_transform(make_tuple(
-                                   number<issue_along_k>{}, number<kABKLane>{}, number<KPack_>{}))),
-                    make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}),
-                    make_tuple(sequence<0>{}, sequence<1>{}));
-
-                return lds_desc_m_k;
-            }
-        }
-        else
-        {
-        }
+        constexpr auto lds_desc_m_k = transform_tensor_descriptor(
+            lds_block_desc_0,
+            make_tuple(make_merge_transform(
+                            make_tuple(number<Repeat_M>{}, number<kAMLane>{})),
+                        make_merge_transform(make_tuple(
+                            number<Repeat_K>{}, number<kABKLane>{}, number<KPack_>{}))),
+            make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return lds_desc_m_k;
    }

    static constexpr auto GetGemm_AWarpEnc()
@@ -271,10 +262,10 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
        auto a_sld = [&]() {
            constexpr auto a_warp_enc_      = GetGemm_AWarpEnc();
            constexpr auto a_outer_dstr_enc = tile_distribution_encoding<
-                sequence<>,
+                sequence<WarpPerBlock_N>,
                tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_K>>,
-                tuple<sequence<1>>,
-                tuple<sequence<1>>,
+                tuple<sequence<1, 0>>,
+                tuple<sequence<1, 0>>,
                sequence<1, 2>,
                sequence<0, 0>>{};
            constexpr auto a_block_dstr_encode =
@@ -300,6 +291,12 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
            },
            number<a_sld.get_num_of_access()>{});

+
+        printf("----- tid:%d, a_sld:%d\n", static_cast<index_t>(threadIdx.x),
+                        static_cast<index_t>(a_sld.cached_coords_[number<0>{}].get_offset()));
+
+
+
        index_t loop_cnt = k / Block_K;

        // this is the acc thread buffer

--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
@@ -586,6 +586,47 @@ struct FusedMoeGemmPipelineFlatmmPolicy
        return desc;
    }

+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBridgeLdsStoreForUKDesc()
+    {
+        constexpr index_t WarpPerBlock_N = Problem::BlockShape::WarpPerBlock_N0;
+        constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N0;
+        constexpr index_t Repeat_M = Problem::BlockShape::Repeat_M0;
+
+        constexpr index_t kAMLane     = 16;
+        constexpr index_t kABKLane    = 4;
+        constexpr index_t kABKPerLane = 4;
+
+        constexpr index_t KPack       = kABKPerLane;
+
+        constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
+                    make_tuple(number<Repeat_M>{},                  // m
+                               number<Repeat_N>{},                  // n
+                               number<WarpPerBlock_N>{},            // n
+                               number<kABKLane>{},                  // n
+                               number<kAMLane>{},                   // m
+                               number<KPack>{}),                    // n
+                    make_tuple(number<Repeat_N * WarpPerBlock_N * kABKLane * kAMLane * KPack>{},  //  m
+                               number<WarpPerBlock_N * kABKLane * kAMLane * KPack>{},   //  n
+                               number<kABKLane * kAMLane * KPack>{},                    //  n
+                               number<kAMLane * KPack>{},                               //  n
+                               number<KPack>{},                                         //  m
+                               number<1>{}),                                            //  n
+                    number<KPack>{},                // lds store vector(actually no explicit store)
+                    number<1>{});
+
+        constexpr auto desc = transform_tensor_descriptor(
+                lds_block_desc_0,
+                make_tuple(
+                    make_merge_transform(make_tuple(number<Repeat_M>{}, number<kAMLane>{})),
+                    make_merge_transform(make_tuple(number<Repeat_N>{}, number<WarpPerBlock_N>{}, number<kABKLane>{}, number<KPack>{}))
+                ),
+                make_tuple(sequence<0, 4>{}, sequence<1, 2, 3, 5>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return desc;
+    }
+
    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemm0()
    {

--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
@@ -340,12 +340,15 @@ struct FusedMoeGemmPipeline_FlatmmUk
                           number<row_ids_a.size()>{});

        auto bridge_sst_win = [&]() {
-            return make_tile_window(
+            constexpr auto desc_ = Policy::template MakeBridgeLdsStoreForUKDesc<Problem>();
+            constexpr auto dist_ = Policy::template GetUK_0<Problem>().MakeCBlockDist();
+            return make_tile_window_linear(
                make_tensor_view<address_space_enum::lds>(
                    reinterpret_cast<YDataType*>(smem),
-                    Policy::template MakeBridgeLdsStoreDesc<Problem>()),
-                Policy::template MakeBridgeLdsStoreDesc<Problem>().get_lengths(),
-                {0, 0});
+                    desc_),
+                desc_.get_lengths(),
+                {0, 0},
+                dist_);
        }();
        auto o_res =
            make_wave_buffer_resource(reinterpret_cast<const ODataType*>(kargs.o_ptr),
@@ -439,8 +442,56 @@ struct FusedMoeGemmPipeline_FlatmmUk
                              BlockShape::Block_W0); // tile offset for B matrix each unroll

        // return ;
+        //sweep_tile(acc_0,
+        //           [&](auto idx) { typename Problem::GateActivation{}(acc_0(idx), acc_0[idx]); });
        sweep_tile(acc_0,
-                   [&](auto idx) { typename Problem::GateActivation{}(acc_0(idx), acc_0[idx]); });
+                   [&](auto idx0, auto idx1) {
+                        fp32x2_t v_ {acc_0(idx0), acc_0(idx1)};
+                        typename Problem::GateActivation{}(v_, v_);
+                        acc_0(idx0) = v_.x;
+                        acc_0(idx1) = v_.y;
+                    },
+                    sequence<1, 2>{});
+
+#if 0
+        printf("bid:%d,%d, tid:%d, sorted_tile_id:%d(, intermediate_tile_id:%d, e:%d, "
+               "interm_idx_nr:%d, coords:a:%d,%d,%d, row_ids_a:%d,%d,%d, (%d)g_coords:%d.%d.%d, bridge_sst_win:%d"
+               "acc:%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f\n",
+               static_cast<int>(blockIdx.x),
+               static_cast<int>(blockIdx.y),
+               static_cast<int>(threadIdx.x),
+               sorted_tile_id,
+               intermediate_tile_id,
+               expert_id,
+               interm_idx_nr,
+               row_coords_a[0],
+               row_coords_a[1],
+               row_coords_a[7],
+               row_ids_a[0],
+               row_ids_a[1],
+               row_ids_a[7],
+               kr_0 * BlockShape::Block_W0,
+               g_coords[number<0>{}],
+               g_coords[number<1>{}],
+               g_coords[number<7>{}],
+               bridge_sst_win.cached_coords_[number<0>{}].get_offset(),
+                acc_0.get_thread_buffer()[number<0>{}],
+                acc_0.get_thread_buffer()[number<1>{}],
+                acc_0.get_thread_buffer()[number<2>{}],
+                acc_0.get_thread_buffer()[number<3>{}],
+                acc_0.get_thread_buffer()[number<4>{}],
+                acc_0.get_thread_buffer()[number<5>{}],
+                acc_0.get_thread_buffer()[number<6>{}],
+                acc_0.get_thread_buffer()[number<7>{}],
+                acc_0.get_thread_buffer()[number<8 + 0>{}],
+                acc_0.get_thread_buffer()[number<8 + 1>{}],
+                acc_0.get_thread_buffer()[number<8 + 2>{}],
+                acc_0.get_thread_buffer()[number<8 + 3>{}],
+                acc_0.get_thread_buffer()[number<8 + 4>{}],
+                acc_0.get_thread_buffer()[number<8 + 5>{}],
+                acc_0.get_thread_buffer()[number<8 + 6>{}],
+                acc_0.get_thread_buffer()[number<8 + 7>{}]);
+#endif

        auto y_pre = cast_tile<YDataType>(acc_0);
        store_tile(bridge_sst_win, y_pre);