Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
512eeecb
Commit
512eeecb
authored
Jan 31, 2025
by
Qianfeng Zhang
Browse files
Roll-back to load Q directly from global memory instead of using LDS as intermediary stop
parent
3e411ef0
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
2 additions
and
31 deletions
+2
-31
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
.../ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
+2
-31
No files found.
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
View file @
512eeecb
...
...
@@ -179,16 +179,11 @@ struct BlockFmhaPipelineQRKSVSAsync
make_tile_window
(
q_dram_block_window_tmp
.
get_bottom_tensor_view
(),
q_dram_block_window_tmp
.
get_window_lengths
(),
q_dram_block_window_tmp
.
get_window_origin
(),
Policy
::
template
MakeQ
Dram
TileDistribution
<
Problem
>());
auto
original_
q
=
load_tile
(
q_dram_window
);
Policy
::
template
MakeQ
Reg
TileDistribution
<
Problem
>());
auto
q
=
load_tile
(
q_dram_window
);
__builtin_amdgcn_sched_barrier
(
0
);
// Q tile in LDS
QDataType
*
q_lds_ptr
=
static_cast
<
QDataType
*>
(
smem_ptr
);
auto
q_lds
=
make_tensor_view
<
address_space_enum
::
lds
>
(
q_lds_ptr
,
Policy
::
template
MakeQLdsBlockDescriptor
<
Problem
>());
// K tile in LDS
KDataType
*
k_lds_ptr
=
static_cast
<
KDataType
*>
(
smem_ptr
);
auto
k_lds
=
make_tensor_view
<
address_space_enum
::
lds
>
(
...
...
@@ -295,32 +290,8 @@ struct BlockFmhaPipelineQRKSVSAsync
{
0
,
seqlen_k_start
},
// TODO: hdim split?
Policy
::
template
MakeVDramTileDistribution
<
Problem
>());
// store Q into LDS
__builtin_amdgcn_sched_barrier
(
0
);
auto
q_lds_window_for_store
=
make_tile_window
(
q_lds
,
Policy
::
template
MakeQLdsBlockDescriptor
<
Problem
>().
get_lengths
(),
{
0
,
0
});
store_tile
(
q_lds_window_for_store
,
original_q
);
__builtin_amdgcn_sched_barrier
(
0
);
// load Q from LDS
auto
q_lds_window_for_load
=
make_tile_window
(
q_lds
,
Policy
::
template
MakeQLdsBlockDescriptor
<
Problem
>().
get_lengths
(),
{
0
,
0
},
Policy
::
template
MakeQRegTileDistribution
<
Problem
>());
block_sync_lds
();
auto
q
=
load_tile
(
q_lds_window_for_load
);
auto
q_tile
=
tile_elementwise_in
(
q_element_func
,
q
);
__builtin_amdgcn_sched_barrier
(
0
);
index_t
i_total_loops
=
0
;
// ensure loading of Q from LDS completely done
block_sync_lds
();
do
{
if
(
i_total_loops
==
0
)
// executed by fist iteration
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment