Commit d4de8495 authored by danyao12's avatar danyao12
Browse files

rename & ensure thread safety

parent 871c7556
...@@ -67,7 +67,7 @@ set(EXAMPLE_FMHA_BWD "tile_example_fmha_bwd") ...@@ -67,7 +67,7 @@ set(EXAMPLE_FMHA_BWD "tile_example_fmha_bwd")
# to be included in "make all/install/check" # to be included in "make all/install/check"
message("adding example ${EXAMPLE_FMHA_BWD}") message("adding example ${EXAMPLE_FMHA_BWD}")
add_executable(${EXAMPLE_FMHA_BWD} EXCLUDE_FROM_ALL hsaco/bwd_bf16_a16.cpp hsaco/bwd_bf16_a16_rtz.cpp hsaco/bwd_bf16_a32.cpp hsaco/bwd_bf16_causal_a16.cpp hsaco/bwd_bf16_causal_a16_rtz.cpp hsaco/bwd_bf16_causal_a32.cpp hsaco/bwd_bf16_nocoex_a32.cpp hsaco/bwd_bf16_nocoex_causal_a32.cpp hsaco/bwd_fp16_a16.cpp hsaco/bwd_fp16_a32.cpp hsaco/bwd_fp16_causal_a16.cpp hsaco/bwd_fp16_causal_a32.cpp hsaco/bwd_fp16_nocoex_a32.cpp hsaco/bwd_fp16_nocoex_causal_a32.cpp fmha_bwd.cpp) add_executable(${EXAMPLE_FMHA_BWD} EXCLUDE_FROM_ALL hsaco/bwd_bf16_a16.cpp hsaco/bwd_bf16_a16_rtz.cpp hsaco/bwd_bf16_a32.cpp hsaco/bwd_bf16_causal_a16.cpp hsaco/bwd_bf16_causal_a16_rtz.cpp hsaco/bwd_bf16_causal_a32.cpp hsaco/bwd_bf16_spec_a32.cpp hsaco/bwd_bf16_spec_causal_a32.cpp hsaco/bwd_fp16_a16.cpp hsaco/bwd_fp16_a32.cpp hsaco/bwd_fp16_causal_a16.cpp hsaco/bwd_fp16_causal_a32.cpp hsaco/bwd_fp16_spec_a32.cpp hsaco/bwd_fp16_spec_causal_a32.cpp fmha_bwd.cpp)
target_include_directories(${EXAMPLE_FMHA_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_include_directories(${EXAMPLE_FMHA_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_sources(${EXAMPLE_FMHA_BWD} PRIVATE ${FMHA_BWD_GEN_BLOBS}) target_sources(${EXAMPLE_FMHA_BWD} PRIVATE ${FMHA_BWD_GEN_BLOBS})
......
...@@ -92,17 +92,17 @@ auto create_args(int argc, char* argv[]) ...@@ -92,17 +92,17 @@ auto create_args(int argc, char* argv[])
"0", "0",
"if set to 1 will use multi-buffer reduction strategy for dq, atomic opeartion " "if set to 1 will use multi-buffer reduction strategy for dq, atomic opeartion "
"will not be used") "will not be used")
.insert("ext_asm", "0", "if set to 1, some cases will call the ext asm dqdkdv kernel") .insert("bwd_v3", "0", "if set to 1, some cases will call the bwd v3 dqdkdv kernel")
.insert( .insert(
"asm_atomic_fp32", "v3_atomic_fp32",
"1", "1",
"if set to 0 will use atomic fp16/bf16(w/o convert_dq kernel) when ext_asm is set to 1") "if set to 0 will use atomic fp16/bf16(w/o convert_dq kernel) when bwd_v3 is set to 1")
.insert("asm_no_coex", .insert("v3_spec",
"0", "0",
"if set to 1 will use non-coexectuion kernel when ext_asm is set to 1") "if set to 1 will call the specialized v3 kernel when bwd_v3 is set to 1")
.insert("asm_rtz_cvt", .insert("v3_rtz_cvt",
"0", "0",
"if set to 1 will use float to bf16 RTZ convert when ext_asm is set to 1"); "if set to 1 will use float to bf16 RTZ convert when bwd_v3 is set to 1");
bool result = arg_parser.parse(argc, argv); bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser); return std::make_tuple(result, arg_parser);
...@@ -187,14 +187,14 @@ bool run(const ck_tile::ArgParser& arg_parser) ...@@ -187,14 +187,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
seed.reset(); seed.reset();
} }
int stream_warmup = arg_parser.get_int("warmup"); int stream_warmup = arg_parser.get_int("warmup");
int stream_repeat = arg_parser.get_int("repeat"); int stream_repeat = arg_parser.get_int("repeat");
bool kname = arg_parser.get_bool("kname"); bool kname = arg_parser.get_bool("kname");
bool deterministic = arg_parser.get_bool("deterministic"); bool deterministic = arg_parser.get_bool("deterministic");
bool ext_asm = arg_parser.get_bool("ext_asm"); bool bwd_v3 = arg_parser.get_bool("bwd_v3");
bool asm_atomic_fp32 = arg_parser.get_bool("asm_atomic_fp32"); bool v3_atomic_fp32 = arg_parser.get_bool("v3_atomic_fp32");
bool asm_no_coex = arg_parser.get_bool("asm_no_coex"); bool v3_spec = arg_parser.get_bool("v3_spec");
bool asm_rtz_cvt = arg_parser.get_bool("asm_rtz_cvt"); bool v3_rtz_cvt = arg_parser.get_bool("v3_rtz_cvt");
ck_tile::stream_config stream_config{nullptr, ck_tile::stream_config stream_config{nullptr,
true, true,
...@@ -430,10 +430,10 @@ bool run(const ck_tile::ArgParser& arg_parser) ...@@ -430,10 +430,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
p_drop > 0.0f, p_drop > 0.0f,
s_randval, s_randval,
deterministic, deterministic,
ext_asm, bwd_v3,
asm_atomic_fp32, v3_atomic_fp32,
asm_no_coex, v3_spec,
asm_rtz_cvt}; v3_rtz_cvt};
auto fmha_args = [&]() { auto fmha_args = [&]() {
assert(nhead % nhead_k == 0); assert(nhead % nhead_k == 0);
/// NOTE: we broadcast bias from [1, 1, seqlen_q, seqlen_k] to [batch, nhead, seqlen_q, /// NOTE: we broadcast bias from [1, 1, seqlen_q, seqlen_k] to [batch, nhead, seqlen_q,
......
...@@ -438,10 +438,10 @@ struct fmha_bwd_traits ...@@ -438,10 +438,10 @@ struct fmha_bwd_traits
bool has_dropout; bool has_dropout;
bool is_store_randval; bool is_store_randval;
bool is_deterministic; bool is_deterministic;
bool uses_ext_asm; bool uses_bwd_v3;
bool is_asm_atomic_fp32; bool is_v3_atomic_fp32;
bool is_asm_no_coex; bool is_v3_spec;
bool is_asm_rtz_cvt; bool is_v3_rtz_cvt;
// TODO: padding check is inside this api // TODO: padding check is inside this api
}; };
float fmha_bwd(fmha_bwd_traits, fmha_bwd_args, const ck_tile::stream_config&); float fmha_bwd(fmha_bwd_traits, fmha_bwd_args, const ck_tile::stream_config&);
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "fmha_hsaco.hpp" #include "fmha_hsaco.hpp"
unsigned char bwd_bf16_nocoex_a32[] = { unsigned char bwd_bf16_spec_a32[] = {
0x7F, 0x45, 0x4C, 0x46, 0x02, 0x01, 0x01, 0x40, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7F, 0x45, 0x4C, 0x46, 0x02, 0x01, 0x01, 0x40, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x03, 0x00, 0xE0, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0xE0, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB0, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB0, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "fmha_hsaco.hpp" #include "fmha_hsaco.hpp"
unsigned char bwd_bf16_nocoex_causal_a32[] = { unsigned char bwd_bf16_spec_causal_a32[] = {
0x7F, 0x45, 0x4C, 0x46, 0x02, 0x01, 0x01, 0x40, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7F, 0x45, 0x4C, 0x46, 0x02, 0x01, 0x01, 0x40, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x03, 0x00, 0xE0, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0xE0, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "fmha_hsaco.hpp" #include "fmha_hsaco.hpp"
unsigned char bwd_fp16_nocoex_a32[] = { unsigned char bwd_fp16_spec_a32[] = {
0x7F, 0x45, 0x4C, 0x46, 0x02, 0x01, 0x01, 0x40, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7F, 0x45, 0x4C, 0x46, 0x02, 0x01, 0x01, 0x40, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x03, 0x00, 0xE0, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0xE0, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "fmha_hsaco.hpp" #include "fmha_hsaco.hpp"
unsigned char bwd_fp16_nocoex_causal_a32[] = { unsigned char bwd_fp16_spec_causal_a32[] = {
0x7F, 0x45, 0x4C, 0x46, 0x02, 0x01, 0x01, 0x40, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7F, 0x45, 0x4C, 0x46, 0x02, 0x01, 0x01, 0x40, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x03, 0x00, 0xE0, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0xE0, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
......
...@@ -9,11 +9,11 @@ extern unsigned char bwd_bf16_a32[]; ...@@ -9,11 +9,11 @@ extern unsigned char bwd_bf16_a32[];
extern unsigned char bwd_bf16_causal_a16[]; extern unsigned char bwd_bf16_causal_a16[];
extern unsigned char bwd_bf16_causal_a16_rtz[]; extern unsigned char bwd_bf16_causal_a16_rtz[];
extern unsigned char bwd_bf16_causal_a32[]; extern unsigned char bwd_bf16_causal_a32[];
extern unsigned char bwd_bf16_nocoex_a32[]; extern unsigned char bwd_bf16_spec_a32[];
extern unsigned char bwd_bf16_nocoex_causal_a32[]; extern unsigned char bwd_bf16_spec_causal_a32[];
extern unsigned char bwd_fp16_a16[]; extern unsigned char bwd_fp16_a16[];
extern unsigned char bwd_fp16_a32[]; extern unsigned char bwd_fp16_a32[];
extern unsigned char bwd_fp16_causal_a16[]; extern unsigned char bwd_fp16_causal_a16[];
extern unsigned char bwd_fp16_causal_a32[]; extern unsigned char bwd_fp16_causal_a32[];
extern unsigned char bwd_fp16_nocoex_a32[]; extern unsigned char bwd_fp16_spec_a32[];
extern unsigned char bwd_fp16_nocoex_causal_a32[]; extern unsigned char bwd_fp16_spec_causal_a32[];
...@@ -9,23 +9,23 @@ for hdim in 128 ; do ...@@ -9,23 +9,23 @@ for hdim in 128 ; do
nhead=$((2048 / $hdim)) # follow fav2 setup nhead=$((2048 / $hdim)) # follow fav2 setup
$EXE -prec=$prec -b=32 -h=$nhead -d=$hdim -s=512 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=32 -h=$nhead -d=$hdim -s=512 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=32 -h=$nhead -d=$hdim -s=512 -iperm=$perm -operm=$perm -kname=1 -ext_asm=1 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=32 -h=$nhead -d=$hdim -s=512 -iperm=$perm -operm=$perm -kname=1 -bwd_v3=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=32 -h=$nhead -d=$hdim -s=512 -iperm=$perm -operm=$perm -kname=1 -ext_asm=1 -asm_atomic_fp32=0 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=32 -h=$nhead -d=$hdim -s=512 -iperm=$perm -operm=$perm -kname=1 -bwd_v3=1 -v3_atomic_fp32=0 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=16 -h=$nhead -d=$hdim -s=1024 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=16 -h=$nhead -d=$hdim -s=1024 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=16 -h=$nhead -d=$hdim -s=1024 -iperm=$perm -operm=$perm -kname=1 -ext_asm=1 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=16 -h=$nhead -d=$hdim -s=1024 -iperm=$perm -operm=$perm -kname=1 -bwd_v3=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=16 -h=$nhead -d=$hdim -s=1024 -iperm=$perm -operm=$perm -kname=1 -ext_asm=1 -asm_atomic_fp32=0 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=16 -h=$nhead -d=$hdim -s=1024 -iperm=$perm -operm=$perm -kname=1 -bwd_v3=1 -v3_atomic_fp32=0 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=8 -h=$nhead -d=$hdim -s=2048 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=8 -h=$nhead -d=$hdim -s=2048 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=8 -h=$nhead -d=$hdim -s=2048 -iperm=$perm -operm=$perm -kname=1 -ext_asm=1 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=8 -h=$nhead -d=$hdim -s=2048 -iperm=$perm -operm=$perm -kname=1 -bwd_v3=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=8 -h=$nhead -d=$hdim -s=2048 -iperm=$perm -operm=$perm -kname=1 -ext_asm=1 -asm_atomic_fp32=0 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=8 -h=$nhead -d=$hdim -s=2048 -iperm=$perm -operm=$perm -kname=1 -bwd_v3=1 -v3_atomic_fp32=0 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=4 -h=$nhead -d=$hdim -s=4096 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=4 -h=$nhead -d=$hdim -s=4096 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=4 -h=$nhead -d=$hdim -s=4096 -iperm=$perm -operm=$perm -kname=1 -ext_asm=1 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=4 -h=$nhead -d=$hdim -s=4096 -iperm=$perm -operm=$perm -kname=1 -bwd_v3=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=4 -h=$nhead -d=$hdim -s=4096 -iperm=$perm -operm=$perm -kname=1 -ext_asm=1 -asm_atomic_fp32=0 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=4 -h=$nhead -d=$hdim -s=4096 -iperm=$perm -operm=$perm -kname=1 -bwd_v3=1 -v3_atomic_fp32=0 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=2 -h=$nhead -d=$hdim -s=8192 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=2 -h=$nhead -d=$hdim -s=8192 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=2 -h=$nhead -d=$hdim -s=8192 -iperm=$perm -operm=$perm -kname=1 -ext_asm=1 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=2 -h=$nhead -d=$hdim -s=8192 -iperm=$perm -operm=$perm -kname=1 -bwd_v3=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=2 -h=$nhead -d=$hdim -s=8192 -iperm=$perm -operm=$perm -kname=1 -ext_asm=1 -asm_atomic_fp32=0 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=2 -h=$nhead -d=$hdim -s=8192 -iperm=$perm -operm=$perm -kname=1 -bwd_v3=1 -v3_atomic_fp32=0 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=1 -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=1 -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=1 -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kname=1 -ext_asm=1 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=1 -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kname=1 -bwd_v3=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=1 -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kname=1 -ext_asm=1 -asm_atomic_fp32=0 -v=$VALID ; sleep 3 $EXE -prec=$prec -b=1 -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kname=1 -bwd_v3=1 -v3_atomic_fp32=0 -v=$VALID ; sleep 3
done done
done done
......
...@@ -11,12 +11,12 @@ set -x ...@@ -11,12 +11,12 @@ set -x
for prec in "fp16" "bf16" ; do for prec in "fp16" "bf16" ; do
for perm in 0 1 ; do for perm in 0 1 ; do
for hdim in 128 ; do for hdim in 128 ; do
for asm_atomic_fp32 in 0 1 ; do for v3_atomic_fp32 in 0 1 ; do
for asm_no_coex in 0 1 ; do for v3_spec in 0 1 ; do
for mask in 0 1 ; do for mask in 0 1 ; do
$EXE -prec=$prec -b=4 -h=2 -d=$hdim -s=512 -iperm=$perm -operm=$perm -mask=$mask -ext_asm=1 -asm_atomic_fp32=$asm_atomic_fp32 -asm_no_coex=$asm_no_coex -mode=0 -kname=$KNAME $COMMON_ARGS $EXE -prec=$prec -b=4 -h=2 -d=$hdim -s=512 -iperm=$perm -operm=$perm -mask=$mask -bwd_v3=1 -v3_atomic_fp32=$v3_atomic_fp32 -v3_spec=$v3_spec -mode=0 -kname=$KNAME $COMMON_ARGS
$EXE -prec=$prec -b=1 -h=3 -d=$hdim -s=768 -iperm=$perm -operm=$perm -mask=$mask -ext_asm=1 -asm_atomic_fp32=$asm_atomic_fp32 -asm_no_coex=$asm_no_coex -mode=0 -kname=$KNAME $COMMON_ARGS $EXE -prec=$prec -b=1 -h=3 -d=$hdim -s=768 -iperm=$perm -operm=$perm -mask=$mask -bwd_v3=1 -v3_atomic_fp32=$v3_atomic_fp32 -v3_spec=$v3_spec -mode=0 -kname=$KNAME $COMMON_ARGS
done done
done done
......
...@@ -13,8 +13,8 @@ for perm in 0 1 ; do ...@@ -13,8 +13,8 @@ for perm in 0 1 ; do
for hdim in 128 ; do for hdim in 128 ; do
for mask in 0 1 ; do for mask in 0 1 ; do
$EXE -prec=$prec -b=2 -h=4 -h_k=2 -d=$hdim -s=512 -iperm=$perm -operm=$perm -mask=$mask -ext_asm=1 -asm_atomic_fp32=0 -asm_rtz_cvt=1 -mode=0 -kname=$KNAME $COMMON_ARGS $EXE -prec=$prec -b=2 -h=4 -h_k=2 -d=$hdim -s=512 -iperm=$perm -operm=$perm -mask=$mask -bwd_v3=1 -v3_atomic_fp32=0 -v3_rtz_cvt=1 -mode=0 -kname=$KNAME $COMMON_ARGS
$EXE -prec=$prec -b=1 -h=3 -h_k=1 -d=$hdim -s=768 -iperm=$perm -operm=$perm -mask=$mask -ext_asm=1 -asm_atomic_fp32=0 -asm_rtz_cvt=1 -mode=0 -kname=$KNAME $COMMON_ARGS $EXE -prec=$prec -b=1 -h=3 -h_k=1 -d=$hdim -s=768 -iperm=$perm -operm=$perm -mask=$mask -bwd_v3=1 -v3_atomic_fp32=0 -v3_rtz_cvt=1 -mode=0 -kname=$KNAME $COMMON_ARGS
done done
done done
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment