constintcol_idx_limit_right=std::min(max_seqlen_k,row_idx+max_seqlen_k-max_seqlen_q);// attention, when max_seqlen_k == max_seqlen_q, vgpr can be reduced again
constexprint__kHeadDim=(REUSE_KV_TIMES>=16)?kHeadDim:kHeadDim+4/*<=15 can use misalign to reduce bank conflicts, but >16 may lead to lds>32KB, less waves per SIMD*/;