Add static checks for vector loads

422a69b2 · letaoqin · 6c971dc8 · 422a69b2 · 422a69b2 · 422a69b2
Commit 422a69b2 authored Oct 18, 2023 by letaoqin
7 changed files
--- a/example/52_flash_atten_bias/grouped_multihead_attention_bias_backward_v2.cpp
+++ b/example/52_flash_atten_bias/grouped_multihead_attention_bias_backward_v2.cpp
@@ -24,7 +24,7 @@ Kernel outputs:
 */
 #define USING_MASK 0
-#define DIM 128 // DIM should be a multiple of 8.
+#define DIM 64 // DIM should be a multiple of 8.
 #include <iostream>
 #include <numeric>

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v1.hpp
@@ -88,6 +88,9 @@ template <typename InputDataType,
          PipelineVersion PipelineVer = PipelineVersion::v1>
 struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
 {
+    static_assert(AK1Value % ABlockTransferDstScalarPerVector_AK1 == 0);
+    static_assert(BK1Value % BBlockTransferDstScalarPerVector_BK1 == 0);
    static_assert(KPerBlock == Gemm1NPerBlock);
    static_assert(MPerBlock % Gemm1KPerBlock == 0);
    static_assert(NPerBlock % Gemm2KPerBlock == 0);

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v2.hpp
@@ -96,6 +96,10 @@ template <typename InputDataType,
          PipelineVersion PipelineVer = PipelineVersion::v1>
 struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
 {
+    static_assert(AK1Value % ABlockTransferDstScalarPerVector_AK1 == 0);
+    static_assert(BK1Value % BBlockTransferDstScalarPerVector_BK1 == 0);
+    static_assert(B1K1Value % B1BlockTransferDstScalarPerVector_BK1 == 0);
    static_assert(Gemm1NPerBlock % KPerBlock == 0);
    static_assert(MPerBlock % Gemm1KPerBlock == 0);
    static_assert(NPerBlock % Gemm2KPerBlock == 0);

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
@@ -87,6 +87,9 @@ template <typename InputDataType,
          PipelineVersion PipelineVer = PipelineVersion::v1>
 struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
 {
+    static_assert(AK1Value % ABlockTransferDstScalarPerVector_AK1 == 0);
+    static_assert(BK1Value % BBlockTransferDstScalarPerVector_BK1 == 0);
    static_assert(KPerBlock == Gemm1NPerBlock);
    static_assert(MPerBlock % Gemm1KPerBlock == 0);
    static_assert(NPerBlock % Gemm2KPerBlock == 0);

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
@@ -95,6 +95,10 @@ template <typename InputDataType,
          PipelineVersion PipelineVer = PipelineVersion::v1>
 struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
 {
+    static_assert(AK1Value % ABlockTransferDstScalarPerVector_AK1 == 0);
+    static_assert(BK1Value % BBlockTransferDstScalarPerVector_BK1 == 0);
+    static_assert(B1K1Value % B1BlockTransferDstScalarPerVector_BK1 == 0);
    static_assert(Gemm1NPerBlock % KPerBlock == 0);
    static_assert(MPerBlock % Gemm1KPerBlock == 0);
    static_assert(NPerBlock % Gemm2KPerBlock == 0);

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_fwd_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_fwd_xdl_cshuffle_v2.hpp
@@ -97,6 +97,10 @@ template <typename FloatAB,
          PipelineVersion PipelineVer = PipelineVersion::v1>
 struct GridwiseBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
 {
+    static_assert(AK1Value % ABlockTransferDstScalarPerVector_AK1 == 0);
+    static_assert(BK1Value % BBlockTransferDstScalarPerVector_BK1 == 0);
+    static_assert(B1K1Value % B1BlockTransferDstScalarPerVector_BK1 == 0);
    static_assert(D0BlockTransferSrcScalarPerVector == 1 ||
                      D0BlockTransferSrcScalarPerVector == 2 ||
                      D0BlockTransferSrcScalarPerVector == 4,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_infer_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_infer_xdl_cshuffle.hpp
@@ -88,6 +88,10 @@ template <typename FloatAB,
          PipelineVersion PipelineVer = PipelineVersion::v1>
 struct GridwiseMultiHeadFlashAttentionInfer_Xdl_CShuffle
 {
+    static_assert(AK1Value % ABlockTransferDstScalarPerVector_AK1 == 0);
+    static_assert(BK1Value % BBlockTransferDstScalarPerVector_BK1 == 0);
+    static_assert(B1K1Value % B1BlockTransferDstScalarPerVector_BK1 == 0);
    static_assert(D0BlockTransferSrcScalarPerVector == 1 ||
                      D0BlockTransferSrcScalarPerVector == 2 ||
                      D0BlockTransferSrcScalarPerVector == 4,