// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

// Auto-generated by generate.py.
#include "fmha_fwd.hpp"

float fmha_fwd(fmha_fwd_traits t, fmha_fwd_args a, const ck_tile::stream_config& s){
    float r = -1;
    if(t.data_type.compare("fp16") == 0){
        if (t.hdim_q <= 32 && t.hdim_v <= 32) {
            if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdFp16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }

        }
        else if (t.hdim_q <= 64 && t.hdim_v <= 64) {
            if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }

        }
        else if (t.hdim_q <= 128 && t.hdim_v <= 128) {
            if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }

        }
        else if (t.hdim_q <= 256 && t.hdim_v <= 256) {
            if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }

        }

    }
    else if(t.data_type.compare("bf16") == 0){
        if (t.hdim_q <= 32 && t.hdim_v <= 32) {
            if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, false, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 32 != 0*/) && (true /*a.hdim_v % 32 != 0*/)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<32, FmhaFwdBf16, true, 128, 64, 16, 32, 32, 32, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }

        }
        else if (t.hdim_q <= 64 && t.hdim_v <= 64) {
            if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 64 != 0*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 64 != 0*/) && (true /*a.hdim_v % 64 != 0*/)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdBf16, true, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }

        }
        else if (t.hdim_q <= 128 && t.hdim_v <= 128) {
            if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 128 != 0*/) && (true /*a.hdim_v % 128 != 0*/)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdBf16, true, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }

        }
        else if (t.hdim_q <= 256 && t.hdim_v <= 256) {
            if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == true) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == true) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == false) &&
                        (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdBf16, true, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>;
                return fmha_fwd_<trait_>(s, a);
            }

        }

    }
    else if(t.data_type.compare("fp8") == 0){
        if (t.hdim_q <= 64 && t.hdim_v <= 64) {
            if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp8, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp8, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp8, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp8, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp8, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0)) {
                using trait_ = fmha_fwd_traits_<64, FmhaFwdFp8, false, 128, 64, 32, 64, 32, 64, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }

        }
        else if (t.hdim_q <= 128 && t.hdim_v <= 128) {
            if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp8, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp8, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp8, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp8, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp8, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0)) {
                using trait_ = fmha_fwd_traits_<128, FmhaFwdFp8, false, 128, 128, 32, 128, 32, 128, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }

        }
        else if (t.hdim_q <= 256 && t.hdim_v <= 256) {
            if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp8, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp8, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp8, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<false>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp8, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::elementwise_bias) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp8, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }
            else if((t.is_group_mode == false) && (t.is_v_rowmajor == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false)  && (t.has_dropout == false) && (t.do_fp8_static_quant == true) &&
                        (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) {
                using trait_ = fmha_fwd_traits_<256, FmhaFwdFp8, false, 128, 128, 32, 256, 32, 256, false, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask<true>, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false, false>;
                return fmha_fwd_<trait_>(s, a);
            }

        }

    }

    return r;
}
