array_aligned<bf16,cosize_v<SmemLayoutS>>s[D_QK==576?1:2];// For V3.2 (whose D_QK is 576), we overlap sS[0] with k's RoPE part to save shared memory; For MODEL1 (whose D_QK is 512), we allocate two buffers
boolis_kv_valid[2][B_TOPK];
float2sM[32];
float2sL[64];// For reduction across WG0/1 in epilogue