Commit f01a06c4 authored by danyao12

mi300 test decoder

parent 1128cd3a
@@ -24,7 +24,7 @@ Kernel outputs:
 */
 #define PRINT_HOST 0
-#define USING_MASK 0
+#define USING_MASK 1
 #define DIM 64 // DIM should be a multiple of 8.
 #include <iostream>
...
@@ -24,7 +24,7 @@ Kernel outputs:
 */
 #define PRINT_HOST 0
-#define USING_MASK 0
+#define USING_MASK 1
 #define DIM 128 // DIM should be a multiple of 8.
 #include <iostream>
...
@@ -24,7 +24,7 @@ Kernel outputs:
 */
 #define PRINT_HOST 0
-#define USING_MASK 0
+#define USING_MASK 1
 #define DIM 64 // DIM should be a multiple of 8.
 #include <iostream>
...
@@ -31,7 +31,7 @@ Kernel outputs:
 */
 #define PRINT_HOST 0
-#define USING_MASK 0
+#define USING_MASK 1
 #define DIM 128 // DIM should be a multiple of 8.
 #include <iostream>
...
@@ -31,7 +31,7 @@ Kernel outputs:
 */
 #define PRINT_HOST 0
-#define USING_MASK 0
+#define USING_MASK 1
 #define DIM 128 // DIM should be a multiple of 8.
 #include <iostream>
...
@@ -23,7 +23,7 @@ Kernel outputs:
 */
-#define USING_MASK 0
+#define USING_MASK 1
 #define DIM 64 // DIM should be a multiple of 8.
 #include <iostream>
...
@@ -23,7 +23,7 @@ Kernel outputs:
 */
-#define USING_MASK 0
+#define USING_MASK 1
 #define DIM 128 // DIM should be a multiple of 8.
 #include <iostream>
...
@@ -30,7 +30,7 @@ Kernel outputs:
 */
-#define USING_MASK 0
+#define USING_MASK 1
 #define DIM 64 // DIM should be a multiple of 8.
 #include <iostream>
...
@@ -30,7 +30,7 @@ Kernel outputs:
 */
-#define USING_MASK 0
+#define USING_MASK 1
 #define DIM 64 // DIM should be a multiple of 8.
 #include <iostream>
...
@@ -1867,11 +1867,11 @@ struct GridwiseBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1
         {
             auto n_block_data_idx_on_grid =
                 __builtin_amdgcn_readfirstlane(gemm1_k_block_outer_index * NPerBlock);
-            if(c0_matrix_mask.IsTileSkippable(
-                   m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock))
-            {
-                continue;
-            }
+            // if(c0_matrix_mask.IsTileSkippable(
+            //        m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock))
+            // {
+            //     continue;
+            // }
             // gemm dP
             // dP = dY * V^T
...
@@ -1775,11 +1775,11 @@ struct GridwiseBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V2
         {
             auto n_block_data_idx_on_grid =
                 __builtin_amdgcn_readfirstlane(gemm1_k_block_outer_index * NPerBlock);
-            if(c0_matrix_mask.IsTileSkippable(
-                   m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock))
-            {
-                continue;
-            }
+            // if(c0_matrix_mask.IsTileSkippable(
+            //        m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock))
+            // {
+            //     continue;
+            // }
             // S = Q * K^T
             gemm0_gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(
                 q_grid_desc_k0_m_k1,
...
@@ -1798,11 +1798,11 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
         {
             auto m_block_data_idx_on_grid =
                 __builtin_amdgcn_readfirstlane(gemm0_m_block_outer_index * MPerBlock);
-            if(c0_matrix_mask.IsTileSkippable(
-                   m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock))
-            {
-                continue;
-            }
+            // if(c0_matrix_mask.IsTileSkippable(
+            //        m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock))
+            // {
+            //     continue;
+            // }
             // load ygrad
             gemm_tile_ygrad_blockwise_copy.Run(ygrad_grid_desc_o0_m_o1,
...
@@ -1721,11 +1721,11 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
         {
             auto m_block_data_idx_on_grid =
                 __builtin_amdgcn_readfirstlane(gemm0_m_block_outer_index * MPerBlock);
-            if(c0_matrix_mask.IsTileSkippable(
-                   m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock))
-            {
-                continue;
-            }
+            // if(c0_matrix_mask.IsTileSkippable(
+            //        m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock))
+            // {
+            //     continue;
+            // }
             //
             // calculate Y dot dY
...
@@ -926,11 +926,11 @@ struct GridwiseBatchedMultiheadAttentionForward_Xdl_CShuffle_V1
         {
             auto n_block_data_idx_on_grid =
                 __builtin_amdgcn_readfirstlane(gemm1_k_block_outer_index * NPerBlock);
-            if(c0_matrix_mask.IsTileSkippable(
-                   m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock))
-            {
-                continue;
-            }
+            // if(c0_matrix_mask.IsTileSkippable(
+            //        m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock))
+            // {
+            //     continue;
+            // }
             // gemm0
             gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
                                                                    a_block_desc_ak0_m_ak1,
...
@@ -1080,11 +1080,11 @@ struct GridwiseBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
         {
             auto n_block_data_idx_on_grid =
                 __builtin_amdgcn_readfirstlane(gemm1_k_block_outer_index * NPerBlock);
-            if(c0_matrix_mask.IsTileSkippable(
-                   m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock))
-            {
-                continue;
-            }
+            // if(c0_matrix_mask.IsTileSkippable(
+            //        m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock))
+            // {
+            //     continue;
+            // }
             // gemm0
             gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
                                                                    a_block_desc_ak0_m_ak1,
...