Commit 64b5f20f authored by danyao12's avatar danyao12
Browse files

Bwd Qloop_PT2&Split K in LDS/V in Vgpr

parent e296ee56
...@@ -15,9 +15,11 @@ add_example_executable(example_grouped_multihead_attention_forward_v2 grouped_mu ...@@ -15,9 +15,11 @@ add_example_executable(example_grouped_multihead_attention_forward_v2 grouped_mu
add_example_executable(example_batched_multihead_attention_forward_v2 batched_multihead_attention_forward_v2.cpp) add_example_executable(example_batched_multihead_attention_forward_v2 batched_multihead_attention_forward_v2.cpp)
add_example_executable(example_grouped_multihead_attention_backward_v2 grouped_multihead_attention_backward_v2.cpp) add_example_executable(example_grouped_multihead_attention_backward_v2 grouped_multihead_attention_backward_v2.cpp)
add_example_executable(example_batched_multihead_attention_backward_v2 batched_multihead_attention_backward_v2.cpp) add_example_executable(example_batched_multihead_attention_backward_v2 batched_multihead_attention_backward_v2.cpp)
add_example_executable(example_batched_multihead_attention_backward_v2_protro batched_multihead_attention_backward_v2_protro.cpp)
add_example_executable(example_grouped_multihead_attention_train_v2 grouped_multihead_attention_train_v2.cpp) add_example_executable(example_grouped_multihead_attention_train_v2 grouped_multihead_attention_train_v2.cpp)
add_example_executable(example_batched_multihead_attention_train_v2 batched_multihead_attention_train_v2.cpp) add_example_executable(example_batched_multihead_attention_train_v2 batched_multihead_attention_train_v2.cpp)
add_example_executable(example_batched_multihead_attention_backward_v3 batched_multihead_attention_backward_v3.cpp) add_example_executable(example_batched_multihead_attention_backward_v3 batched_multihead_attention_backward_v3.cpp)
add_example_executable(example_batched_multihead_attention_backward_v3_protro batched_multihead_attention_backward_v3_protro.cpp)
add_example_executable(example_grouped_multihead_attention_backward_v3 grouped_multihead_attention_backward_v3.cpp) add_example_executable(example_grouped_multihead_attention_backward_v3 grouped_multihead_attention_backward_v3.cpp)
add_custom_target(example_gemm_scale_softmax_gemm) add_custom_target(example_gemm_scale_softmax_gemm)
......
...@@ -879,6 +879,18 @@ struct BlockwiseGemmXdlops_v2 ...@@ -879,6 +879,18 @@ struct BlockwiseGemmXdlops_v2
b_thread_copy_.SetSrcCoord(b_origin); b_thread_copy_.SetSrcCoord(b_origin);
} }
template <typename SrcSliceMoveStepIdx>
// Advances this thread's read window into the A block by the given multi-index step.
// Forwards to a_thread_copy_.MoveSrcSliceWindow over the A block descriptor
// (a_block_desc_m0_m1_m2_k); both members are declared elsewhere in this struct.
// NOTE(review): presumably used to step through K (or split-K) sub-tiles between
// MFMA iterations without recomputing the full source coordinate — confirm against callers.
__device__ void MoveABlockSrcSliceWindow(const SrcSliceMoveStepIdx& src_slice_move_step_idx)
{
    a_thread_copy_.MoveSrcSliceWindow(a_block_desc_m0_m1_m2_k, src_slice_move_step_idx);
}
template <typename SrcSliceMoveStepIdx>
// Advances this thread's read window into the B block by the given multi-index step.
// Forwards to b_thread_copy_.MoveSrcSliceWindow over the B block descriptor
// (b_block_desc_n0_n1_n2_k); both members are declared elsewhere in this struct.
// Mirrors MoveABlockSrcSliceWindow for the B operand.
__device__ void MoveBBlockSrcSliceWindow(const SrcSliceMoveStepIdx& src_slice_move_step_idx)
{
    b_thread_copy_.MoveSrcSliceWindow(b_block_desc_n0_n1_n2_k, src_slice_move_step_idx);
}
// transposed XDL output supporting C_xdl' = B_xdl' * A_xdl' // transposed XDL output supporting C_xdl' = B_xdl' * A_xdl'
__host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4() __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment