firt version for 2 kernel

513abed6 · ltqin · 71e2a917 · 513abed6 · 513abed6 · 513abed6
Commit 513abed6 authored Jun 15, 2023 by ltqin
10 changed files
--- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
+++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
@@ -12,6 +12,7 @@ add_example_executable(example_grouped_multihead_attention_backward_v2 grouped_m
 add_example_executable(example_batched_multihead_attention_backward batched_multihead_attention_backward.cpp)
 add_example_executable(example_batched_multihead_attention_backward_v2 batched_multihead_attention_backward_v2.cpp)
 add_example_executable(example_batched_multihead_attention_backward_v3 batched_multihead_attention_backward_v3.cpp)
+add_example_executable(example_batched_multihead_attention_backward_v4 batched_multihead_attention_backward_v4.cpp)
 add_example_executable(example_grouped_multihead_attention_train grouped_multihead_attention_train.cpp)
 add_example_executable(example_batched_multihead_attention_train batched_multihead_attention_train.cpp)


--- a/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v2.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v2.cpp
@@ -25,7 +25,7 @@ Kernel outputs:

 #define PRINT_HOST 0
 #define USING_MASK 0
-#define DIM 128 // DIM should be a multiple of 8.
+#define DIM 64 // DIM should be a multiple of 8.

 #include <iostream>
 #include <numeric>
@@ -259,7 +259,7 @@ void run_attention_fwd_host(const TensorQ& q_g_m_k,

 int run(int argc, char* argv[])
 {
-    bool do_verification = true;
+    bool do_verification = false;
    int init_method      = 2; // method 1 will have slightly higher error; TODO: to investigate
    bool time_kernel     = true;

@@ -271,8 +271,8 @@ int run(int argc, char* argv[])
    ck::index_t N  = 512;
    ck::index_t K  = DIM;
    ck::index_t O  = DIM;
-    ck::index_t G0 = 4;
-    ck::index_t G1 = 6;
+    ck::index_t G0 = 12;
+    ck::index_t G1 = 16;

    bool input_permute  = false;
    bool output_permute = false;

--- a/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v4.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v4.cpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_multihead_attention_backward_xdl_cshuffle_v6.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_multihead_attention_backward_xdl_cshuffle_v6.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_multihead_attention_backward_xdl_cshuffle_v7.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_multihead_attention_backward_xdl_cshuffle_v7.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_bacckward_ydotygrad.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_bacckward_ydotygrad.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_pt2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_pt2.hpp
@@ -1448,7 +1448,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2
                                             1,
                                             false>{
                lse_grid_desc_mblock_mrepeat_mwave_mperxdl,
-                make_multi_index(block_work_idx_m,       // mblock
+                make_multi_index(block_work_idx_m,         // mblock
                                 acc0_thread_origin[I0],   // mrepeat
                                 acc0_thread_origin[I2],   // mwave
                                 acc0_thread_origin[I4])}; // mperxdl
@@ -1511,14 +1511,14 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2
            1, // DstScalarStrideInVector
            true>{z_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
                  make_multi_index(block_work_idx_m, // MBlockId
-                                   0,                  // NBlockId
-                                   0,                  // mrepeat
-                                   0,                  // nrepeat
-                                   wave_id[I0],        // MWaveId
-                                   wave_id[I1],        // NWaveId
-                                   wave_m_n_id[I1],    // MPerXdl
-                                   0,                  // group
-                                   wave_m_n_id[I0],    // NInputIndex
+                                   0,                // NBlockId
+                                   0,                // mrepeat
+                                   0,                // nrepeat
+                                   wave_id[I0],      // MWaveId
+                                   wave_id[I1],      // NWaveId
+                                   wave_m_n_id[I1],  // MPerXdl
+                                   0,                // group
+                                   wave_m_n_id[I0],  // NInputIndex
                                   0),
                  tensor_operation::element_wise::PassThrough{}};


--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_pt8.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_pt8.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_pt9.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_pt9.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_forward_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_forward_xdl_cshuffle.hpp
@@ -917,7 +917,7 @@ struct GridwiseBatchedMultiheadAttentionForward_Xdl_CShuffle
        {
            block_sync_lds();
        }
-        
+
        do
        {
            auto n_block_data_idx_on_grid =