Commit 5ab137f4 authored by danyao12

add traits

parent a0491b67

@@ -333,38 +333,40 @@ float fmha_ext_bwd_(const ck_tile::stream_config& s, fmha_bwd_args a, unsigned
 float fmha_bwd(fmha_bwd_traits t, fmha_bwd_args a, const ck_tile::stream_config& s){{
     float r = -1;
+    if (t.uses_ext_asm == true){{
     if ((t.is_group_mode == false) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) &&
         (a.seqlen_q == a.seqlen_k) && (a.seqlen_k % 128 == 0) && (a.hdim_q == 128) && (a.hdim_v == 128) && (t.is_deterministic == false)) {{
         if(t.data_type.compare("fp16") == 0){{
             if(t.mask_type == mask_enum::no_mask){{
                 using dot_do_o_trait_   = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>;
                 using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, false>;
                 const std::string bwd_ext_name = "bwd_ext_fp16_a32";
                 r = fmha_ext_bwd_<dot_do_o_trait_, convert_dq_trait_>(s, a, bwd_fp16_a32, bwd_ext_name);
                 return r;
             }}
             else if((t.mask_type != mask_enum::no_mask) && ((a.window_size_left == -1) && (a.window_size_right == 0))){{
                 using dot_do_o_trait_   = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>;
                 using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, false>;
                 const std::string bwd_ext_name = "bwd_ext_fp16_causal_a32";
                 r = fmha_ext_bwd_<dot_do_o_trait_, convert_dq_trait_>(s, a, bwd_fp16_causal_a32, bwd_ext_name);
                 return r;
             }}
         }}
         else if(t.data_type.compare("bf16") == 0){{
             if(t.mask_type == mask_enum::no_mask){{
                 using dot_do_o_trait_   = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>;
                 using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, false>;
                 const std::string bwd_ext_name = "bwd_ext_bf16_a32";
                 r = fmha_ext_bwd_<dot_do_o_trait_, convert_dq_trait_>(s, a, bwd_bf16_a32, bwd_ext_name);
                 return r;
             }}
             else if((t.mask_type != mask_enum::no_mask) && ((a.window_size_left == -1) && (a.window_size_right == 0))){{
                 using dot_do_o_trait_   = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>;
                 using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, false>;
                 const std::string bwd_ext_name = "bwd_ext_bf16_causal_a32";
                 r = fmha_ext_bwd_<dot_do_o_trait_, convert_dq_trait_>(s, a, bwd_bf16_causal_a32, bwd_ext_name);
                 return r;
             }}
         }}
     }}
+    }}
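Note on this hunk: `uses_ext_asm` only gates entry into the hand-written dqdkdv kernels. When the flag is off, or the guards above fail (batch mode only, no bias/dbias/dropout, seqlen_q == seqlen_k, seqlen_k a multiple of 128, hdim_q == hdim_v == 128, non-deterministic), control falls through to the existing generated dispatch after this hunk. A minimal caller sketch, using only names visible in this diff; `make_bwd_args()` is a hypothetical helper and the `stream_config` constructor arguments are assumptions:

// Sketch: opt an fp16, hdim=128, equal-seqlen problem into the ext ASM path.
// Member names come from this diff; all values are illustrative.
fmha_bwd_traits t{};
t.data_type          = "fp16";
t.is_group_mode      = false;
t.mask_type          = mask_enum::no_mask;   // selects "bwd_ext_fp16_a32"
t.bias_type          = bias_enum::no_bias;
t.has_dbias          = false;
t.has_dropout        = false;
t.is_deterministic   = false;
t.uses_ext_asm       = true;    // new in this commit: enable the ext ASM path
t.is_asm_atomic_fp32 = true;    // new in this commit: fp32 atomics for dq

fmha_bwd_args a = make_bwd_args();  // hypothetical helper, not part of this commit
// The ext path additionally requires a.seqlen_q == a.seqlen_k,
// a.seqlen_k % 128 == 0, and a.hdim_q == a.hdim_v == 128.

float avg_ms = fmha_bwd(t, a, ck_tile::stream_config{nullptr, true});
// avg_ms stays -1 if neither the ext path nor the generated dispatch accepts the problem.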
...
@@ -91,7 +91,9 @@ auto create_args(int argc, char* argv[])
         .insert("deterministic",
                 "0",
                 "if set to 1 will use multi-buffer reduction strategy for dq, atomic operation "
-                "will not be used");
+                "will not be used")
+        .insert("ext_asm", "0", "if set to 1, some cases will call the ext asm dqdkdv kernel")
+        .insert("asm_atomic_fp32", "1", "if set to 0, atomic fp16/bf16 is used when calling the ext asm dqdkdv kernel");
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
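Usage sketch for the two new options. The binary name and the pre-existing flags are assumptions (they are not shown in this diff); only `-ext_asm` and `-asm_atomic_fp32` are introduced here:

# hypothetical invocation; per the help text, -asm_atomic_fp32=0 makes the ext
# kernel accumulate dq with fp16/bf16 atomics instead of fp32
./tile_example_fmha_bwd -prec=fp16 -d=128 -s=4096 -mask=1 -ext_asm=1 -asm_atomic_fp32=0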
...
@@ -176,10 +178,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
         seed.reset();
     }
 
     int stream_warmup  = arg_parser.get_int("warmup");
     int stream_repeat  = arg_parser.get_int("repeat");
     bool kname         = arg_parser.get_bool("kname");
     bool deterministic = arg_parser.get_bool("deterministic");
+    bool ext_asm         = arg_parser.get_bool("ext_asm");
+    bool asm_atomic_fp32 = arg_parser.get_bool("asm_atomic_fp32");
 
     ck_tile::stream_config stream_config{nullptr,
                                          true,
...
@@ -416,7 +420,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
                               use_dbias,
                               p_drop > 0.0f,
                               s_randval,
-                              deterministic};
+                              deterministic,
+                              ext_asm,
+                              asm_atomic_fp32};
     auto fmha_args = [&]() {
         assert(nhead % nhead_k == 0);
         /// NOTE: we broadcast bias from [1, 1, seqlen_q, seqlen_k] to [batch, nhead, seqlen_q,
...
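Reviewer note on the hunk above: the traits object is built by positional aggregate initialization, so `ext_asm` and `asm_atomic_fp32` must be appended in exactly the order the corresponding members are declared in `fmha_bwd_traits` (next hunk). Both are `bool`, so swapping them would still compile and silently exchange the two behaviors.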
@@ -438,6 +438,8 @@ struct fmha_bwd_traits
     bool has_dropout;
     bool is_store_randval;
     bool is_deterministic;
+    bool uses_ext_asm;
+    bool is_asm_atomic_fp32;
     // TODO: padding check is inside this api
 };
 float fmha_bwd(fmha_bwd_traits, fmha_bwd_args, const ck_tile::stream_config&);
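To keep that ordering pitfall visible at call sites, a hedged alternative, assuming `fmha_bwd_traits` stays a plain aggregate (as the declaration above suggests) and a C++20 toolchain:

// Same two new members, but named explicitly; values are illustrative.
// Members omitted before has_dropout are value-initialized.
fmha_bwd_traits t{
    .has_dropout        = false,
    .is_store_randval   = false,
    .is_deterministic   = false,
    .uses_ext_asm       = true,   // mirrors -ext_asm=1
    .is_asm_atomic_fp32 = false,  // mirrors -asm_atomic_fp32=0
};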