NXdlPerWave must be one

22593950 · letaoqin · 5a032110 · 22593950 · 22593950 · 22593950
Commit 22593950 authored Nov 07, 2023 by letaoqin
4 changed files
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v1.hpp
@@ -1237,6 +1237,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
            static constexpr index_t Size  = sizeof(ck::half_t);
        };
        static constexpr index_t NThreadClusterLengths = 32;
+        static_assert(NXdlPerWave == 1);
        static_assert(NPerXdl == 32);
        static_assert(D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock,
                      "D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock");

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_light_v2.hpp
@@ -1316,7 +1316,9 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
            static constexpr index_t Size0 = 0;
            static constexpr index_t Size  = sizeof(ck::half_t);
        };
        static constexpr index_t NThreadClusterLengths = 32;
+        static_assert(NXdlPerWave == 1);
        static_assert(NPerXdl == 32);
        static_assert(D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock,
                      "D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock");

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
@@ -1304,7 +1304,9 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
            static constexpr index_t Size0 = 0;
            static constexpr index_t Size  = sizeof(ck::half_t);
        };
        static constexpr index_t NThreadClusterLengths = 32;
+        static_assert(NXdlPerWave == 1);
        static_assert(NPerXdl == 32);
        static_assert(D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock,
                      "D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock");

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
@@ -1370,7 +1370,9 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            static constexpr index_t Size0 = 0;
            static constexpr index_t Size  = sizeof(ck::half_t);
        };
        static constexpr index_t NThreadClusterLengths = 32;
+        static_assert(NXdlPerWave == 1);
        static_assert(NPerXdl == 32);
        static_assert(D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock,
                      "D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock");