Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
de4494a2
Commit
de4494a2
authored
Aug 28, 2023
by
danyao12
Browse files
Bwd Qloop_PT1&Split K in LDS/V in Vgpr
parent
cc18fafa
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
4049 additions
and
107 deletions
+4049
-107
example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v2_protro.cpp
...x_gemm/batched_multihead_attention_backward_v2_protro.cpp
+12
-11
example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v3_protro.cpp
...x_gemm/batched_multihead_attention_backward_v3_protro.cpp
+13
-12
include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v1_protro.hpp
...ce_batched_mha_bwd_xdl_cshuffle_qloop_light_v1_protro.hpp
+1444
-0
include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1_protro.hpp
...l/device_batched_mha_bwd_xdl_cshuffle_qloop_v1_protro.hpp
+4
-1
include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v1_protro.hpp
...atched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v1_protro.hpp
+2387
-0
include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1_protro3.hpp
...ise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1_protro3.hpp
+189
-83
No files found.
example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v2_protro.cpp
View file @
de4494a2
This diff is collapsed.
Click to expand it.
example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v3_protro.cpp
View file @
de4494a2
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v1_protro.hpp
0 → 100644
View file @
de4494a2
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1_protro.hpp
View file @
de4494a2
...
@@ -15,7 +15,7 @@
...
@@ -15,7 +15,7 @@
#include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
#include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1_protro
2
.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1_protro
3
.hpp"
#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp"
#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/kernel_launch.hpp"
...
@@ -255,6 +255,7 @@ template <index_t NumDimG,
...
@@ -255,6 +255,7 @@ template <index_t NumDimG,
index_t
KPerBlock
,
// Gemm0KPerBlock
index_t
KPerBlock
,
// Gemm0KPerBlock
index_t
Gemm1NPerBlock
,
index_t
Gemm1NPerBlock
,
index_t
Gemm1KPerBlock
,
index_t
Gemm1KPerBlock
,
index_t
Gemm2KPerBlock
,
index_t
AK1
,
index_t
AK1
,
index_t
BK1
,
index_t
BK1
,
index_t
B1K1
,
index_t
B1K1
,
...
@@ -665,6 +666,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
...
@@ -665,6 +666,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
KPerBlock
,
KPerBlock
,
Gemm1NPerBlock
,
Gemm1NPerBlock
,
Gemm1KPerBlock
,
Gemm1KPerBlock
,
Gemm2KPerBlock
,
AK1
,
AK1
,
BK1
,
BK1
,
B1K1
,
B1K1
,
...
@@ -1283,6 +1285,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
...
@@ -1283,6 +1285,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
<<
MPerBlock
<<
", "
<<
MPerBlock
<<
", "
<<
Gemm1NPerBlock
<<
", "
<<
Gemm1NPerBlock
<<
", "
<<
Gemm1KPerBlock
<<
", "
<<
Gemm1KPerBlock
<<
", "
<<
Gemm2KPerBlock
<<
", "
<<
B1K1
<<
", "
<<
B1K1
<<
", "
<<
getGemmSpecializationString
(
GemmSpec
)
<<
", "
<<
getGemmSpecializationString
(
GemmSpec
)
<<
", "
<<
"ASpec"
<<
getTensorSpecializationString
(
ASpec
)
<<
", "
<<
"ASpec"
<<
getTensorSpecializationString
(
ASpec
)
<<
", "
...
...
include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v1_protro.hpp
0 → 100644
View file @
de4494a2
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1_protro3.hpp
View file @
de4494a2
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment