Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
7cccb0a8
Commit
7cccb0a8
authored
Dec 14, 2024
by
Ye Wang
Browse files
Enable sbhd for customer case v3 bwd debug
parent
b39eecf1
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
36 additions
and
34 deletions
+36
-34
example/ck_tile/01_fmha/CMakeLists.txt
example/ck_tile/01_fmha/CMakeLists.txt
+2
-2
example/ck_tile/01_fmha/fmha_bwd.cpp
example/ck_tile/01_fmha/fmha_bwd.cpp
+34
-32
No files found.
example/ck_tile/01_fmha/CMakeLists.txt
View file @
7cccb0a8
...
...
@@ -83,11 +83,11 @@ set(EXAMPLE_FMHA_BWD_COMPILE_OPTIONS)
# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
# ... because they are auto-generated
if
(
FMHA_FWD_FAST_EXP2
)
list
(
APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero
)
list
(
APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1
-DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3
-fgpu-flush-denormals-to-zero
)
else
()
list
(
APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0
)
endif
()
list
(
APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -fgpu-flush-denormals-to-zero
)
list
(
APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS
-DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3
-Wno-undefined-func-template -fgpu-flush-denormals-to-zero
)
# conditionally enable call to the fwd_splitkv API in fmha_fwd example
if
(
"fwd_splitkv"
IN_LIST FMHA_FWD_ENABLE_APIS
)
...
...
example/ck_tile/01_fmha/fmha_bwd.cpp
View file @
7cccb0a8
...
...
@@ -55,7 +55,7 @@ auto create_args(int argc, char* argv[])
.
insert
(
"iperm"
,
"1"
,
"permute input
\n
"
"if true, will be b*h*
s*
d, else b*s*h*d"
)
"if true, will be
s*
b*h*d, else b*s*h*d"
)
.
insert
(
"operm"
,
"1"
,
"permute output"
)
.
insert
(
"bias"
,
"n"
,
...
...
@@ -160,7 +160,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
if
(
hdim_v
<
0
)
hdim_v
=
hdim_q
;
bool
i_perm
=
arg_parser
.
get_bool
(
"iperm"
);
// if true, will be batch * nhead *
seqlen *
hdim
bool
i_perm
=
arg_parser
.
get_bool
(
"iperm"
);
// if true, will be
seqlen*
batch * nhead * hdim
bool
o_perm
=
arg_parser
.
get_bool
(
"operm"
);
// if false, will be batch * seqlen * nhead * hdim
float
scale
=
arg_parser
.
get_float
(
"scale"
);
...
...
@@ -287,7 +287,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile
::
index_t
s
/*seqlen*/
,
ck_tile
::
index_t
d
/*hdim*/
)
{
if
(
permute
)
return
std
::
array
<
ck_tile
::
index_t
,
4
>
{
b
,
h
,
s
,
d
};
return
std
::
array
<
ck_tile
::
index_t
,
4
>
{
s
,
b
,
h
,
d
};
else
return
std
::
array
<
ck_tile
::
index_t
,
4
>
{
b
,
s
,
h
,
d
};
};
...
...
@@ -417,7 +417,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
// clang-format off
auto
layout_str
=
[
&
](
bool
permute
){
if
(
permute
)
return
std
::
string
(
"bh
s
d"
);
if
(
permute
)
return
std
::
string
(
"
s
bhd"
);
else
return
std
::
string
(
"bshd"
);
};
auto
io_layout
=
[
&
](
bool
iperm_
,
bool
operm_
)
{
...
...
@@ -462,41 +462,43 @@ bool run(const ck_tile::ArgParser& arg_parser)
/// seqlen_k] in this example, hence both the 'batch_stride_bias' &
/// 'nhead_stride_bias' are 0.
// setup stride_* arguments
const
ck_tile
::
index_t
stride_q
=
(
i_perm
?
hdim_q
:
nhead
*
hdim_q
);
const
ck_tile
::
index_t
stride_k
=
(
i_perm
?
hdim_q
:
nhead_k
*
hdim_q
);
const
ck_tile
::
index_t
stride_v
=
(
i_perm
?
hdim_v
:
nhead_k
*
hdim_v
);
// bshd vs sbhd (perm)
const
ck_tile
::
index_t
stride_q
=
(
i_perm
?
batch
*
nhead
*
hdim_q
:
nhead
*
hdim_q
);
const
ck_tile
::
index_t
stride_k
=
(
i_perm
?
batch
*
nhead_k
*
hdim_q
:
nhead_k
*
hdim_q
);
const
ck_tile
::
index_t
stride_v
=
(
i_perm
?
batch
*
nhead_k
*
hdim_v
:
nhead_k
*
hdim_v
);
const
ck_tile
::
index_t
stride_bias
=
(
max_seqlen_k
);
const
ck_tile
::
index_t
stride_o
=
(
o_perm
?
hdim_v
:
nhead
*
hdim_v
);
const
ck_tile
::
index_t
stride_o
=
(
o_perm
?
batch
*
nhead
*
hdim_v
:
nhead
*
hdim_v
);
const
ck_tile
::
index_t
stride_randval
=
(
max_seqlen_k
);
const
ck_tile
::
index_t
stride_do
=
(
o_perm
?
hdim_v
:
nhead
*
hdim_v
);
const
ck_tile
::
index_t
stride_do
=
(
o_perm
?
batch
*
nhead
*
hdim_v
:
nhead
*
hdim_v
);
const
ck_tile
::
index_t
stride_dq_acc
=
hdim_q
;
const
ck_tile
::
index_t
stride_dk
=
(
i_perm
?
hdim_q
:
nhead
*
hdim_q
);
const
ck_tile
::
index_t
stride_dv
=
(
i_perm
?
hdim_v
:
nhead
*
hdim_v
);
const
ck_tile
::
index_t
stride_dk
=
(
i_perm
?
batch
*
nhead
*
hdim_q
:
nhead
*
hdim_q
);
const
ck_tile
::
index_t
stride_dv
=
(
i_perm
?
batch
*
nhead
*
hdim_v
:
nhead
*
hdim_v
);
const
ck_tile
::
index_t
stride_dbias
=
(
i_perm
?
max_seqlen_k
:
nhead
*
max_seqlen_k
);
// setup nhead_stride_* arguments
const
ck_tile
::
index_t
nhead_stride_q
=
(
i_perm
?
shape_seqlen_q
*
hdim_q
:
hdim_q
)
;
const
ck_tile
::
index_t
nhead_stride_k
=
(
i_perm
?
shape_seqlen_k
*
hdim_q
:
hdim_q
)
;
const
ck_tile
::
index_t
nhead_stride_v
=
(
i_perm
?
shape_seqlen_k
*
hdim_v
:
hdim_v
)
;
const
ck_tile
::
index_t
nhead_stride_q
=
hdim_q
;
const
ck_tile
::
index_t
nhead_stride_k
=
hdim_q
;
const
ck_tile
::
index_t
nhead_stride_v
=
hdim_v
;
const
ck_tile
::
index_t
nhead_stride_bias
=
0
;
const
ck_tile
::
index_t
nhead_stride_o
=
(
o_perm
?
shape_seqlen_q
*
hdim_v
:
hdim_v
)
;
const
ck_tile
::
index_t
nhead_stride_o
=
hdim_v
;
const
ck_tile
::
index_t
nhead_stride_randval
=
(
shape_seqlen_q
*
max_seqlen_k
);
const
ck_tile
::
index_t
nhead_stride_do
=
(
o_perm
?
shape_seqlen_q
*
hdim_v
:
hdim_v
)
;
const
ck_tile
::
index_t
nhead_stride_do
=
hdim_v
;
const
ck_tile
::
index_t
nhead_stride_lsed
=
shape_seqlen_q
;
const
ck_tile
::
index_t
nhead_stride_dq_acc
=
shape_seqlen_q
*
hdim_q
;
const
ck_tile
::
index_t
nhead_stride_dbias
=
(
i_perm
?
shape_seqlen_q
*
max_seqlen_k
:
max_seqlen_k
);
// setup batch_stride_* arguments
const
ck_tile
::
index_t
batch_stride_q
=
(
nhead
*
shape_seqlen_q
*
hdim_q
);
const
ck_tile
::
index_t
batch_stride_k
=
(
nhead_k
*
shape_seqlen_k
*
hdim_q
);
const
ck_tile
::
index_t
batch_stride_v
=
(
nhead_k
*
shape_seqlen_k
*
hdim_v
);
const
ck_tile
::
index_t
batch_stride_q
=
(
i_perm
?
nhead
*
hdim_q
:
nhead
*
shape_seqlen_q
*
hdim_q
);
const
ck_tile
::
index_t
batch_stride_k
=
(
i_perm
?
nhead_k
*
hdim_q
:
nhead_k
*
shape_seqlen_k
*
hdim_q
);
const
ck_tile
::
index_t
batch_stride_v
=
(
i_perm
?
nhead_k
*
hdim_v
:
nhead_k
*
shape_seqlen_k
*
hdim_v
);
const
ck_tile
::
index_t
batch_stride_bias
=
0
;
const
ck_tile
::
index_t
batch_stride_o
=
(
nhead
*
shape_seqlen_q
*
hdim_v
);
const
ck_tile
::
index_t
batch_stride_o
=
(
o_perm
?
nhead
*
hdim_v
:
nhead
*
shape_seqlen_q
*
hdim_v
);
const
ck_tile
::
index_t
batch_stride_randval
=
(
nhead
*
shape_seqlen_q
*
max_seqlen_k
);
const
ck_tile
::
index_t
batch_stride_do
=
(
nhead
*
shape_seqlen_q
*
hdim_v
);
const
ck_tile
::
index_t
batch_stride_do
=
(
o_perm
?
nhead
*
hdim_v
:
nhead
*
shape_seqlen_q
*
hdim_v
);
const
ck_tile
::
index_t
batch_stride_lsed
=
(
nhead
*
shape_seqlen_q
);
const
ck_tile
::
index_t
batch_stride_dk
=
(
nhead
*
shape_seqlen_k
*
hdim_q
);
const
ck_tile
::
index_t
batch_stride_dv
=
(
nhead
*
shape_seqlen_k
*
hdim_v
);
const
ck_tile
::
index_t
batch_stride_dk
=
(
i_perm
?
nhead
*
hdim_q
:
nhead
*
shape_seqlen_k
*
hdim_q
);
const
ck_tile
::
index_t
batch_stride_dv
=
(
i_perm
?
nhead
*
hdim_q
:
nhead
*
shape_seqlen_k
*
hdim_v
);
const
ck_tile
::
index_t
batch_stride_dbias
=
(
nhead
*
shape_seqlen_q
*
max_seqlen_k
);
const
ck_tile
::
index_t
batch_stride_dq_acc
=
(
nhead
*
shape_seqlen_q
*
hdim_q
);
const
ck_tile
::
index_t
split_stride_dq_acc
=
(
shape_batch
*
nhead
*
shape_seqlen_q
*
hdim_q
);
...
...
@@ -574,7 +576,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
batch_stride_randval
,
batch_stride_do
,
batch_stride_lsed
,
batch_stride_
q
,
// batch_stride_dq_acc
batch_stride_
dq_acc
,
// batch_stride_dq_acc
batch_stride_q
,
// batch_stride_dq
batch_stride_dk
,
batch_stride_dv
,
...
...
@@ -651,14 +653,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
// clang-format off
// permute
if
(
i_perm
)
q_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
i
)
{
self
(
i
)
=
q_host
(
b
,
i
[
0
],
i
[
1
]
+
query_offset
,
i
[
2
]);
});
if
(
i_perm
)
q_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
i
)
{
self
(
i
)
=
q_host
(
i
[
1
]
+
query_offset
,
b
,
i
[
0
],
i
[
2
]);
});
else
q_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
i
)
{
self
(
i
)
=
q_host
(
b
,
i
[
1
]
+
query_offset
,
i
[
0
],
i
[
2
]);
});
if
(
i_perm
)
k_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
i
)
{
self
(
i
)
=
k_host
(
b
,
i
[
0
]
/
nr
,
i
[
1
]
+
key_offset
,
i
[
2
]);
});
if
(
i_perm
)
k_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
i
)
{
self
(
i
)
=
k_host
(
i
[
1
]
+
key_offset
,
b
,
i
[
0
]
/
nr
,
i
[
2
]);
});
else
k_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
i
)
{
self
(
i
)
=
k_host
(
b
,
i
[
1
]
+
key_offset
,
i
[
0
]
/
nr
,
i
[
2
]);
});
// v_host_ref: [nhead, hdim, seq], v_host: [b, h_k, s, d]
if
(
i_perm
)
v_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
i
)
{
self
(
i
)
=
v_host
(
b
,
i
[
0
]
/
nr
,
i
[
2
]
+
key_offset
,
i
[
1
]);
});
if
(
i_perm
)
v_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
i
)
{
self
(
i
)
=
v_host
(
i
[
2
]
+
key_offset
,
b
,
i
[
0
]
/
nr
,
i
[
1
]);
});
// v_host_ref: [nhead, hdim, seq], v_host: [b, s, h_k, d]
else
v_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
i
)
{
self
(
i
)
=
v_host
(
b
,
i
[
2
]
+
key_offset
,
i
[
0
]
/
nr
,
i
[
1
]);
});
// clang-format on
...
...
@@ -799,7 +801,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
// clang-format off
// permute
if
(
o_perm
)
o_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
o_host
(
b
,
idx
[
0
],
idx
[
1
]
+
query_offset
,
idx
[
2
])
=
self
(
idx
);
});
if
(
o_perm
)
o_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
o_host
(
idx
[
1
]
+
query_offset
,
b
,
idx
[
0
],
idx
[
2
])
=
self
(
idx
);
});
else
o_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
o_host
(
b
,
idx
[
1
]
+
query_offset
,
idx
[
0
],
idx
[
2
])
=
self
(
idx
);
});
lse_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
lse_host
(
b
,
idx
[
0
],
idx
[
1
]
+
query_offset
)
=
self
(
idx
);
});
...
...
@@ -856,7 +858,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile
::
HostTensor
<
VGradDataType
>
dv_host_ref
({
nhead
,
real_seqlen_k
,
hdim_v
});
// dv_g_n_o
// clang-format off
if
(
o_perm
)
do_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
i
)
{
self
(
i
)
=
do_host
(
b
,
i
[
0
],
i
[
1
]
+
query_offset
,
i
[
2
]);
});
if
(
o_perm
)
do_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
i
)
{
self
(
i
)
=
do_host
(
i
[
1
]
+
query_offset
,
b
,
i
[
0
],
i
[
2
]);
});
else
do_host_ref
.
ForEach
([
&
](
auto
&
self
,
auto
i
)
{
self
(
i
)
=
do_host
(
b
,
i
[
1
]
+
query_offset
,
i
[
0
],
i
[
2
]);
});
// clang-format on
...
...
@@ -936,13 +938,13 @@ bool run(const ck_tile::ArgParser& arg_parser)
// clang-format off
// permute
if
(
i_perm
)
dq_host_result
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
self
(
idx
)
=
dq_host
(
b
,
idx
[
0
],
idx
[
1
]
+
query_offset
,
idx
[
2
]);
});
if
(
i_perm
)
dq_host_result
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
self
(
idx
)
=
dq_host
(
idx
[
1
]
+
query_offset
,
b
,
idx
[
0
],
idx
[
2
]);
});
else
dq_host_result
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
self
(
idx
)
=
dq_host
(
b
,
idx
[
1
]
+
query_offset
,
idx
[
0
],
idx
[
2
]);
});
if
(
i_perm
)
dk_host_result
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
self
(
idx
)
=
dk_host
(
b
,
idx
[
0
],
idx
[
1
]
+
key_offset
,
idx
[
2
]);
});
if
(
i_perm
)
dk_host_result
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
self
(
idx
)
=
dk_host
(
idx
[
1
]
+
key_offset
,
b
,
idx
[
0
],
idx
[
2
]);
});
else
dk_host_result
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
self
(
idx
)
=
dk_host
(
b
,
idx
[
1
]
+
key_offset
,
idx
[
0
],
idx
[
2
]);
});
if
(
i_perm
)
dv_host_result
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
self
(
idx
)
=
dv_host
(
b
,
idx
[
0
],
idx
[
1
]
+
key_offset
,
idx
[
2
]);
});
if
(
i_perm
)
dv_host_result
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
self
(
idx
)
=
dv_host
(
idx
[
1
]
+
key_offset
,
b
,
idx
[
0
],
idx
[
2
]);
});
else
dv_host_result
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
self
(
idx
)
=
dv_host
(
b
,
idx
[
1
]
+
key_offset
,
idx
[
0
],
idx
[
2
]);
});
if
(
use_dbias
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment