Commit 26d84960 authored by shengnxu
Browse files

change some inline parameter style

parent d0c80b12
...@@ -384,22 +384,70 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 ...@@ -384,22 +384,70 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
#include "uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc" #include "uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc"
#undef CK_TILE_FLATMM_UK_MFMA #undef CK_TILE_FLATMM_UK_MFMA
: [s_loop_cnt]"+s"(loop_cnt), : [s_loop_cnt]"+s"(loop_cnt),
// [v_acc_0]"+v"(v_acc[0]), [c0]"+v"(v_z0),
// [v_acc_1]"+v"(v_acc[1]), [c1]"+v"(v_z1),
// [v_acc_2]"+v"(v_acc[2]), [c2]"+v"(v_z2),
// [v_acc_3]"+v"(v_acc[3]), [c3]"+v"(v_z3),
// [v_acc_4]"+v"(v_acc[4]), [c4]"+v"(v_z4),
// [v_acc_5]"+v"(v_acc[5]), [c5]"+v"(v_z5),
// [v_acc_6]"+v"(v_acc[6]), [c6]"+v"(v_z6),
// [v_acc_7]"+v"(v_acc[7]), [c7]"+v"(v_z7),
// [v_acc_8]"+v"(v_acc[8]), [c8]"+v"(v_z8),
// [v_acc_9]"+v"(v_acc[9]), [c9]"+v"(v_z9),
// [v_acc_10]"+v"(v_acc[10]), [c10]"+v"(v_z10),
// [v_acc_11]"+v"(v_acc[11]), [c11]"+v"(v_z11),
// [v_acc_12]"+v"(v_acc[12]), [c12]"+v"(v_z12),
// [v_acc_13]"+v"(v_acc[13]), [c13]"+v"(v_z13),
// [v_acc_14]"+v"(v_acc[14]), [c14]"+v"(v_z14),
// [v_acc_15]"+v"(v_acc[15]), [c15]"+v"(v_z15),
[c16]"+v"(v_z16),
[c17]"+v"(v_z17),
[c18]"+v"(v_z18),
[c19]"+v"(v_z19),
[c20]"+v"(v_z20),
[c21]"+v"(v_z21),
[c22]"+v"(v_z22),
[c23]"+v"(v_z23),
[c24]"+v"(v_z24),
[c25]"+v"(v_z25),
[c26]"+v"(v_z26),
[c27]"+v"(v_z27),
[c28]"+v"(v_z28),
[c29]"+v"(v_z29),
[c30]"+v"(v_z30),
[c31]"+v"(v_z31),
[c32]"+v"(v_z32),
[c33]"+v"(v_z33),
[c34]"+v"(v_z34),
[c35]"+v"(v_z35),
[c36]"+v"(v_z36),
[c37]"+v"(v_z37),
[c38]"+v"(v_z38),
[c39]"+v"(v_z39),
[c40]"+v"(v_z40),
[c41]"+v"(v_z41),
[c42]"+v"(v_z42),
[c43]"+v"(v_z43),
[c44]"+v"(v_z44),
[c45]"+v"(v_z45),
[c46]"+v"(v_z46),
[c47]"+v"(v_z47),
[c48]"+v"(v_z48),
[c49]"+v"(v_z49),
[c50]"+v"(v_z50),
[c51]"+v"(v_z51),
[c52]"+v"(v_z52),
[c53]"+v"(v_z53),
[c54]"+v"(v_z54),
[c55]"+v"(v_z55),
[c56]"+v"(v_z56),
[c57]"+v"(v_z57),
[c58]"+v"(v_z58),
[c59]"+v"(v_z59),
[c60]"+v"(v_z60),
[c61]"+v"(v_z61),
[c62]"+v"(v_z62),
[c63]"+v"(v_z63),
[v_token_id0]"+v"(temp0), [v_token_id0]"+v"(temp0),
[v_token_id1]"+v"(temp1), [v_token_id1]"+v"(temp1),
[s_mem_]"+r"(smem) [s_mem_]"+r"(smem)
...@@ -533,81 +581,74 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 ...@@ -533,81 +581,74 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
); );
// clang-format on // clang-format on
#pragma clang diagnostic pop #pragma clang diagnostic pop
int32x4_t v_acc[16]{0};
v_acc[0][0] = v_z0;
v_acc[0][1] = v_z1;
v_acc[0][2] = v_z2;
v_acc[0][3] = v_z3;
v_acc[1][0] = v_z4;
v_acc[1][1] = v_z5;
v_acc[1][2] = v_z6;
v_acc[1][3] = v_z7;
v_acc[2][0] = v_z8;
v_acc[2][1] = v_z9;
v_acc[2][2] = v_z10;
v_acc[2][3] = v_z11;
v_acc[3][0] = v_z12;
v_acc[3][1] = v_z13;
v_acc[3][2] = v_z14;
v_acc[3][3] = v_z15;
v_acc[4][0] = v_z16;
v_acc[4][1] = v_z17;
v_acc[4][2] = v_z18;
v_acc[4][3] = v_z19;
v_acc[5][0] = v_z20;
v_acc[5][1] = v_z21;
v_acc[5][2] = v_z22;
v_acc[5][3] = v_z23;
v_acc[6][0] = v_z24;
v_acc[6][1] = v_z25;
v_acc[6][2] = v_z26;
v_acc[6][3] = v_z27;
v_acc[7][0] = v_z28;
v_acc[7][1] = v_z29;
v_acc[7][2] = v_z30;
v_acc[7][3] = v_z31;
v_acc[8][0] = v_z32;
v_acc[8][1] = v_z33;
v_acc[8][2] = v_z34;
v_acc[8][3] = v_z35;
v_acc[9][0] = v_z36;
v_acc[9][1] = v_z37;
v_acc[9][2] = v_z38;
v_acc[9][3] = v_z39;
v_acc[10][0] = v_z40;
v_acc[10][1] = v_z41;
v_acc[10][2] = v_z42;
v_acc[10][3] = v_z43;
v_acc[11][0] = v_z44;
v_acc[11][1] = v_z45;
v_acc[11][2] = v_z46;
v_acc[11][3] = v_z47;
v_acc[12][0] = v_z48;
v_acc[12][1] = v_z49;
v_acc[12][2] = v_z50;
v_acc[12][3] = v_z51;
v_acc[13][0] = v_z52;
v_acc[13][1] = v_z53;
v_acc[13][2] = v_z54;
v_acc[13][3] = v_z55;
v_acc[14][0] = v_z56;
v_acc[14][1] = v_z57;
v_acc[14][2] = v_z58;
v_acc[14][3] = v_z59;
v_acc[15][0] = v_z60;
v_acc[15][1] = v_z61;
v_acc[15][2] = v_z62;
v_acc[15][3] = v_z63;
// return local scratch // return local scratch
auto c = MakeCBlockTile(); auto c = MakeCBlockTile();
for(auto i = 0; i < 16; i++) c.get_thread_buffer()[0] = v_z0;
{ c.get_thread_buffer()[1] = v_z1;
c.get_thread_buffer()[4 * i + 0] = v_acc[i].x; c.get_thread_buffer()[2] = v_z2;
c.get_thread_buffer()[4 * i + 1] = v_acc[i].y; c.get_thread_buffer()[3] = v_z3;
c.get_thread_buffer()[4 * i + 2] = v_acc[i].z; c.get_thread_buffer()[4] = v_z4;
c.get_thread_buffer()[4 * i + 3] = v_acc[i].w; c.get_thread_buffer()[5] = v_z5;
} c.get_thread_buffer()[6] = v_z6;
c.get_thread_buffer()[7] = v_z7;
c.get_thread_buffer()[8] = v_z8;
c.get_thread_buffer()[9] = v_z9;
c.get_thread_buffer()[10] = v_z10;
c.get_thread_buffer()[11] = v_z11;
c.get_thread_buffer()[12] = v_z12;
c.get_thread_buffer()[13] = v_z13;
c.get_thread_buffer()[14] = v_z14;
c.get_thread_buffer()[15] = v_z15;
c.get_thread_buffer()[16] = v_z16;
c.get_thread_buffer()[17] = v_z17;
c.get_thread_buffer()[18] = v_z18;
c.get_thread_buffer()[19] = v_z19;
c.get_thread_buffer()[20] = v_z20;
c.get_thread_buffer()[21] = v_z21;
c.get_thread_buffer()[22] = v_z22;
c.get_thread_buffer()[23] = v_z23;
c.get_thread_buffer()[24] = v_z24;
c.get_thread_buffer()[25] = v_z25;
c.get_thread_buffer()[26] = v_z26;
c.get_thread_buffer()[27] = v_z27;
c.get_thread_buffer()[28] = v_z28;
c.get_thread_buffer()[29] = v_z29;
c.get_thread_buffer()[30] = v_z30;
c.get_thread_buffer()[31] = v_z31;
c.get_thread_buffer()[32] = v_z32;
c.get_thread_buffer()[33] = v_z33;
c.get_thread_buffer()[34] = v_z34;
c.get_thread_buffer()[35] = v_z35;
c.get_thread_buffer()[36] = v_z36;
c.get_thread_buffer()[37] = v_z37;
c.get_thread_buffer()[38] = v_z38;
c.get_thread_buffer()[39] = v_z39;
c.get_thread_buffer()[40] = v_z40;
c.get_thread_buffer()[41] = v_z41;
c.get_thread_buffer()[42] = v_z42;
c.get_thread_buffer()[43] = v_z43;
c.get_thread_buffer()[44] = v_z44;
c.get_thread_buffer()[45] = v_z45;
c.get_thread_buffer()[46] = v_z46;
c.get_thread_buffer()[47] = v_z47;
c.get_thread_buffer()[48] = v_z48;
c.get_thread_buffer()[49] = v_z49;
c.get_thread_buffer()[50] = v_z50;
c.get_thread_buffer()[51] = v_z51;
c.get_thread_buffer()[52] = v_z52;
c.get_thread_buffer()[53] = v_z53;
c.get_thread_buffer()[54] = v_z54;
c.get_thread_buffer()[55] = v_z55;
c.get_thread_buffer()[56] = v_z56;
c.get_thread_buffer()[57] = v_z57;
c.get_thread_buffer()[58] = v_z58;
c.get_thread_buffer()[59] = v_z59;
c.get_thread_buffer()[60] = v_z60;
c.get_thread_buffer()[61] = v_z61;
c.get_thread_buffer()[62] = v_z62;
c.get_thread_buffer()[63] = v_z63;
return c; return c;
} }
}; };
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment