add type check

b76c8e62 · letaoqin · 4e6fd810 · b76c8e62 · b76c8e62 · b76c8e62
Commit b76c8e62 authored Oct 11, 2023 by letaoqin
8 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
@@ -1293,6 +1293,16 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
            return false;
        }
+        // dQ is saved via atomic_add, so KzRaw must be even for half_t/bhalf_t output
+        if constexpr(is_same<OutputDataType, half_t>::value ||
+                     is_same<OutputDataType, bhalf_t>::value)
+        {
+            if(KzRaw % 2 != 0)
+            {
+                std::cout << "K_q must be a multiple of 2" << std::endl;
+                return false;
+            }
+        }
        // Check vector load/store requirement
        const auto a_stride_lowest =
            ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0];

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v2.hpp
@@ -1325,6 +1325,16 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
            return false;
        }
+        // dQ is saved via atomic_add, so KzRaw must be even for half_t/bhalf_t output
+        if constexpr(is_same<OutputDataType, half_t>::value ||
+                     is_same<OutputDataType, bhalf_t>::value)
+        {
+            if(KzRaw % 2 != 0)
+            {
+                std::cout << "K_q must be a multiple of 2" << std::endl;
+                return false;
+            }
+        }
        // Check vector load/store requirement
        const auto a_stride_lowest =
            ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0];

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -1153,10 +1153,14 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
        }
        // saving dQ data with atomic_add instruction, so KzRaw must be a multiple of 2
-        if(KzRaw % 2 != 0)
+        if constexpr(is_same<OutputDataType, half_t>::value ||
+                     is_same<OutputDataType, bhalf_t>::value)
        {
-            std::cout << "K_q must be a multiple of 2" << std::endl;
+            if(KzRaw % 2 != 0)
-            return false;
+            {
+                std::cout << "K_q must be a multiple of 2" << std::endl;
+                return false;
+            }
        }
        // Check vector load/store requirement

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -1190,10 +1190,14 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
        }
        // saving dQ data with atomic_add instruction, so KzRaw must be a multiple of 2
-        if(KzRaw % 2 != 0)
+        if constexpr(is_same<OutputDataType, half_t>::value ||
+                     is_same<OutputDataType, bhalf_t>::value)
        {
-            std::cout << "K_q must be a multiple of 2" << std::endl;
+            if(KzRaw % 2 != 0)
-            return false;
+            {
+                std::cout << "K_q must be a multiple of 2" << std::endl;
+                return false;
+            }
        }
        // Check vector load/store requirement

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
@@ -1336,10 +1336,14 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
            }
            // saving dQ data with atomic_add instruction, so KzRaw must be a multiple of 2
-            if(KzRaw % 2 != 0)
+            if constexpr(is_same<OutputDataType, half_t>::value ||
+                         is_same<OutputDataType, bhalf_t>::value)
            {
-                std::cout << "K_q must be a multiple of 2" << std::endl;
+                if(KzRaw % 2 != 0)
-                return false;
+                {
+                    std::cout << "K_q must be a multiple of 2" << std::endl;
+                    return false;
+                }
            }
            // Check vector load/store requirement

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v2.hpp
@@ -1408,10 +1408,14 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
            }
            // saving dQ data with atomic_add instruction, so KzRaw must be a multiple of 2
-            if(KzRaw % 2 != 0)
+            if constexpr(is_same<OutputDataType, half_t>::value ||
+                         is_same<OutputDataType, bhalf_t>::value)
            {
-                std::cout << "K_q must be a multiple of 2" << std::endl;
+                if(KzRaw % 2 != 0)
-                return false;
+                {
+                    std::cout << "K_q must be a multiple of 2" << std::endl;
+                    return false;
+                }
            }
            // Check vector load/store requirement

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -1182,10 +1182,14 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
            }
            // saving dQ data with atomic_add instruction, so KzRaw must be a multiple of 2
-            if(KzRaw % 2 != 0)
+            if constexpr(is_same<OutputDataType, half_t>::value ||
+                         is_same<OutputDataType, bhalf_t>::value)
            {
-                std::cout << "K_q must be a multiple of 2" << std::endl;
+                if(KzRaw % 2 != 0)
-                return false;
+                {
+                    std::cout << "K_q must be a multiple of 2" << std::endl;
+                    return false;
+                }
            }
            // Check vector load/store requirement

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -1253,10 +1253,14 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            }
            // saving dQ data with atomic_add instruction, so KzRaw must be a multiple of 2
-            if(KzRaw % 2 != 0)
+            if constexpr(is_same<OutputDataType, half_t>::value ||
+                         is_same<OutputDataType, bhalf_t>::value)
            {
-                std::cout << "K_q must be a multiple of 2" << std::endl;
+                if(KzRaw % 2 != 0)
-                return false;
+                {
+                    std::cout << "K_q must be a multiple of 2" << std::endl;
+                    return false;
+                }
            }
            // Check vector load/store requirement