add OutputDataType&Deterministic for pt1q1

35b59ac6 · danyao12 · ff6e303d · 35b59ac6 · 35b59ac6 · 35b59ac6
Commit 35b59ac6 authored May 18, 2023 by danyao12
3 changed files
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v3.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v3.cpp
@@ -55,6 +55,7 @@ using F16  = ck::half_t;
 using BF16  = ck::bhalf_t;
 using F32   = float;
 using U16   = unsigned short;
+using INT32 = int32_t;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using Scale       = ck::tensor_operation::element_wise::Scale;
@@ -62,12 +63,13 @@ using Scale       = ck::tensor_operation::element_wise::Scale;
 using QKVElementOp = PassThrough;
 using YElementOp   = PassThrough;

-using DataType         = F16;
+using InputDataType    = F16;
+using OutputDataType   = F16;
 using GemmDataType     = F16;
 using AccDataType      = F32;
 using ShuffleDataType  = F32;
 using LSEDataType      = F32;
-using ZDataType        = U16;
+using ZDataType        = U16; // INT32
 using Acc0BiasDataType = ck::Tuple<>;
 using Acc1BiasDataType = ck::Tuple<>;

@@ -76,6 +78,9 @@ static constexpr ck::index_t NumDimM = 1;
 static constexpr ck::index_t NumDimN = 1;
 static constexpr ck::index_t NumDimK = 1;
 static constexpr ck::index_t NumDimO = 1;
+// When OutputDataType == F32,      CShuffleBlockTransferScalarPerVector_NPerBlock = 4
+// When OutputDataType == F16/BF16, CShuffleBlockTransferScalarPerVector_NPerBlock = 8
+static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock = 8;

 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
 #if USING_MASK
@@ -90,6 +95,7 @@ static constexpr auto TensorSpecQ = ck::tensor_operation::device::TensorSpeciali
 static constexpr auto TensorSpecK   = ck::tensor_operation::device::TensorSpecialization::Default;
 static constexpr auto TensorSpecV   = ck::tensor_operation::device::TensorSpecialization::Default;
 static constexpr auto TensorSpecY   = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr bool Deterministic = false;

 // DIM should be a multiple of 8.
 // If      DIM <= 32 , ues prototype1 1st template.
@@ -103,7 +109,8 @@ using DeviceGemmInstance =
        NumDimN,
        NumDimK,
        NumDimO,
-        DataType,
+        InputDataType,
+        OutputDataType,
        GemmDataType,
        ZDataType,
        LSEDataType,
@@ -161,8 +168,9 @@ using DeviceGemmInstance =
        1,              // CShuffleMXdlPerWavePerShuffle
        1,              // CShuffleNXdlPerWavePerShuffle
        S<1, 64, 1, 4>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
-        MaskingSpec>;   // MaskingSpecialization
+        CShuffleBlockTransferScalarPerVector_NPerBlock, // CShuffleBlockTransferScalarPerVector_NPerBlock
+        MaskingSpec,                                    // MaskingSpecialization
+        Deterministic>;
 #elif(DIM <= 64)
 using DeviceGemmInstance =
    ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1<
@@ -171,7 +179,8 @@ using DeviceGemmInstance =
        NumDimN,
        NumDimK,
        NumDimO,
-        DataType,
+        InputDataType,
+        OutputDataType,
        GemmDataType,
        ZDataType,
        LSEDataType,
@@ -229,8 +238,9 @@ using DeviceGemmInstance =
        1,              // CShuffleMXdlPerWavePerShuffle
        1,              // CShuffleNXdlPerWavePerShuffle
        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
-        MaskingSpec>;   // MaskingSpecialization
+        CShuffleBlockTransferScalarPerVector_NPerBlock, // CShuffleBlockTransferScalarPerVector_NPerBlock
+        MaskingSpec,                                    // MaskingSpecialization
+        Deterministic>;

 // using DeviceGemmInstance =
 //     ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2<
@@ -239,7 +249,8 @@ using DeviceGemmInstance =
 //         NumDimN,
 //         NumDimK,
 //         NumDimO,
-//         DataType,
+//         InputDataType,
+//         OutputDataType,
 //         GemmDataType,
 //         ZDataType,
 //         LSEDataType,
@@ -297,8 +308,9 @@ using DeviceGemmInstance =
 //         1,              // CShuffleMXdlPerWavePerShuffle
 //         2,              // CShuffleNXdlPerWavePerShuffle
 //         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-//         8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
-//         MaskingSpec>;   // MaskingSpecialization
+//         CShuffleBlockTransferScalarPerVector_NPerBlock,
+//         MaskingSpec,
+//         Deterministic>;
 #elif(DIM <= 128)
 using DeviceGemmInstance =
    ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2<
@@ -307,7 +319,8 @@ using DeviceGemmInstance =
        NumDimN,
        NumDimK,
        NumDimO,
-        DataType,
+        InputDataType,
+        OutputDataType,
        GemmDataType,
        ZDataType,
        LSEDataType,
@@ -365,14 +378,15 @@ using DeviceGemmInstance =
        1,              // CShuffleMXdlPerWavePerShuffle
        4,              // CShuffleNXdlPerWavePerShuffle
        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
-        MaskingSpec>;   // MaskingSpecialization
+        CShuffleBlockTransferScalarPerVector_NPerBlock, // CShuffleBlockTransferScalarPerVector_NPerBlock
+        MaskingSpec,                                    // MaskingSpecialization
+        Deterministic>;
 #endif

 // Ref Gemm0: S = alpha * Q * K^T
 // fp16 in, fp32 out
-using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<DataType,
-                                                                                DataType,
+using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<InputDataType,
+                                                                                InputDataType,
                                                                                AccDataType,
                                                                                AccDataType,
                                                                                PassThrough,
@@ -382,13 +396,13 @@ using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<
 // Ref Softmax: P = Softmax(S)
 // fp32 in, fp16 out
 using ReferenceSoftmaxInstance =
-    ck::tensor_operation::host::ReferenceSoftmax<AccDataType, DataType, AccDataType>;
+    ck::tensor_operation::host::ReferenceSoftmax<AccDataType, InputDataType, AccDataType>;

 // Ref Gemm1: Y = P * V
 // fp16 in, fp16 out
-using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<DataType,
-                                                                                DataType,
-                                                                                DataType,
+using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<InputDataType,
+                                                                                InputDataType,
+                                                                                InputDataType,
                                                                                AccDataType,
                                                                                PassThrough,
                                                                                PassThrough,
@@ -396,16 +410,25 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<

 // Ref Gemm for backward pass
 // fp16 in, fp16 out
-using ReferenceGemmGradInstance = ck::tensor_operation::host::ReferenceBatchedGemm<DataType,
-                                                                                   DataType,
-                                                                                   DataType,
+using ReferenceGemm0GradInstance = ck::tensor_operation::host::ReferenceBatchedGemm<InputDataType,
+                                                                                    InputDataType,
+                                                                                    InputDataType,
                                                                                    AccDataType,
                                                                                    PassThrough,
                                                                                    PassThrough,
                                                                                    Scale>;
+
+using ReferenceGemm1GradInstance = ck::tensor_operation::host::ReferenceBatchedGemm<InputDataType,
+                                                                                    InputDataType,
+                                                                                    OutputDataType,
+                                                                                    AccDataType,
+                                                                                    PassThrough,
+                                                                                    PassThrough,
+                                                                                    Scale>;
+
 // Ref dropout
 using ReferenceDropoutInstance =
-    ck::tensor_operation::host::ReferenceDropout<ushort, DataType, DataType>;
+    ck::tensor_operation::host::ReferenceDropout<ZDataType, InputDataType, InputDataType>;

 template <typename TensorQ,
          typename TensorK,
@@ -425,7 +448,7 @@ void run_attention_fwd_host(const TensorQ& q_g_m_k,
                            TensorLSE& lse_g_m,
                            TensorP& p_drop_g_m_n,
                            TensorZ& z_g_m_n,
-                            ushort p_dropout_in_16bits,
+                            ZDataType p_dropout_in_16bits,
                            float rp_dropout)
 {
    // S = alpha * Q * K^T
@@ -532,7 +555,7 @@ int run(int argc, char* argv[])
    }

    float p_dropout               = 1 - p_drop;
-    uint16_t p_dropout_in_16bits = uint16_t(std::floor(p_dropout * 65535.0));
+    ZDataType p_dropout_in_16bits = ZDataType(std::floor(p_dropout * 65535.0));
    float rp_dropout              = 1.0 / p_dropout;
    float alpha                   = 1.f / std::sqrt(K);

@@ -592,12 +615,12 @@ int run(int argc, char* argv[])
    std::vector<ck::index_t> lse_gs_ms_lengths{G0, G1, M};
    std::vector<ck::index_t> lse_gs_ms_strides{G1 * M, M, 1}; // LSE layout [G0, G1, M]

-    Tensor<DataType> q_gs_ms_ks(q_gs_ms_ks_lengths, q_gs_ms_ks_strides);
-    Tensor<DataType> k_gs_ns_ks(k_gs_ns_ks_lengths, k_gs_ns_ks_strides);
+    Tensor<InputDataType> q_gs_ms_ks(q_gs_ms_ks_lengths, q_gs_ms_ks_strides);
+    Tensor<InputDataType> k_gs_ns_ks(k_gs_ns_ks_lengths, k_gs_ns_ks_strides);
    Tensor<ZDataType> z_gs_ms_ns(z_gs_ms_ns_lengths, z_gs_ms_ns_strides);
-    Tensor<DataType> v_gs_os_ns(v_gs_os_ns_lengths, v_gs_os_ns_strides);
-    Tensor<DataType> y_gs_ms_os(y_gs_ms_os_lengths, y_gs_ms_os_strides);
-    Tensor<DataType> ygrad_gs_ms_os(y_gs_ms_os_lengths, y_gs_ms_os_strides);
+    Tensor<InputDataType> v_gs_os_ns(v_gs_os_ns_lengths, v_gs_os_ns_strides);
+    Tensor<InputDataType> y_gs_ms_os(y_gs_ms_os_lengths, y_gs_ms_os_strides);
+    Tensor<InputDataType> ygrad_gs_ms_os(y_gs_ms_os_lengths, y_gs_ms_os_strides);
    Tensor<LSEDataType> lse_gs_ms(lse_gs_ms_lengths, lse_gs_ms_strides);

    std::cout << "q_gs_ms_ks: " << q_gs_ms_ks.mDesc << std::endl;
@@ -607,45 +630,45 @@ int run(int argc, char* argv[])
    std::cout << "y_gs_ms_os: " << y_gs_ms_os.mDesc << std::endl;
    std::cout << "lse_gs_ms_os: " << lse_gs_ms.mDesc << std::endl;

-    z_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<DataType>{0});
+    z_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<InputDataType>{0});
    switch(init_method)
    {
    case 0: break;
    case 1:
-        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<DataType>{-2, 2});
-        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<DataType>{-2, 2});
-        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<DataType>{-2, 2});
-        ygrad_gs_ms_os.GenerateTensorValue(GeneratorTensor_2<DataType>{-2, 2});
+        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<InputDataType>{-2, 2});
+        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<InputDataType>{-2, 2});
+        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<InputDataType>{-2, 2});
+        ygrad_gs_ms_os.GenerateTensorValue(GeneratorTensor_2<InputDataType>{-2, 2});
        break;
    case 2:
-        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<DataType>{0.0, 1.0});
-        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<DataType>{0.0, 1.0});
-        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_3<DataType>{-0.5, 0.5});
-        ygrad_gs_ms_os.GenerateTensorValue(GeneratorTensor_3<DataType>{-0.5, 0.5});
+        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<InputDataType>{0.0, 1.0});
+        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<InputDataType>{0.0, 1.0});
+        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_3<InputDataType>{-0.5, 0.5});
+        ygrad_gs_ms_os.GenerateTensorValue(GeneratorTensor_3<InputDataType>{-0.5, 0.5});
        break;
    case 3:
-        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5});
-        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<DataType>{});
-        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<DataType>{});
-        ygrad_gs_ms_os.GenerateTensorValue(GeneratorTensor_Diagonal<DataType>{});
+        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<InputDataType>{-5, 5});
+        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<InputDataType>{});
+        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<InputDataType>{});
+        ygrad_gs_ms_os.GenerateTensorValue(GeneratorTensor_Diagonal<InputDataType>{});
        break;
    case 4:
-        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<DataType>{1});
-        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1<DataType>{1});
-        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_1<DataType>{1});
-        ygrad_gs_ms_os.GenerateTensorValue(GeneratorTensor_1<DataType>{1});
+        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<InputDataType>{1});
+        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1<InputDataType>{1});
+        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_1<InputDataType>{1});
+        ygrad_gs_ms_os.GenerateTensorValue(GeneratorTensor_1<InputDataType>{1});
        break;
    case 5:
-        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<DataType>{1});
-        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<DataType>{});
-        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<DataType>{});
+        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<InputDataType>{1});
+        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<InputDataType>{});
+        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<InputDataType>{});
        ygrad_gs_ms_os.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); // dy[g0, g1, m, o]
        // dO dot O = [0; 1; 2; ...]
        break;
    case 6:
-        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<DataType>{1});
-        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<DataType>{});
-        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<DataType>{});
+        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<InputDataType>{1});
+        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<InputDataType>{});
+        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<InputDataType>{});
        ygrad_gs_ms_os.GenerateTensorValue(GeneratorTensor_Sequential<3>{}); // dy[g0, g1, m, o]
        // assume mnko = 256
        // P = softmax(QK) = 0.0039 * ones
@@ -656,10 +679,10 @@ int run(int argc, char* argv[])
        //
        break;
    default:
-        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<DataType>{1});
-        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<DataType>{});
-        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<DataType>{});
-        ygrad_gs_ms_os.GenerateTensorValue(GeneratorTensor_1<DataType>{1}); // dy[g0, g1, m, o]
+        q_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<InputDataType>{1});
+        k_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<InputDataType>{});
+        v_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<InputDataType>{});
+        ygrad_gs_ms_os.GenerateTensorValue(GeneratorTensor_1<InputDataType>{1}); // dy[g0, g1, m, o]
        // assume mnko = 256
        // P = softmax(QK) = 0.0039 * ones
        // O = P V = 0.0039 * ones
@@ -670,14 +693,14 @@ int run(int argc, char* argv[])
        //    = 0
    }

-    Tensor<DataType> q_g_m_k({BatchCount, M, K});
-    Tensor<DataType> k_g_n_k({BatchCount, N, K});
+    Tensor<InputDataType> q_g_m_k({BatchCount, M, K});
+    Tensor<InputDataType> k_g_n_k({BatchCount, N, K});
    Tensor<ZDataType> z_g_m_n({BatchCount, M, N});
-    Tensor<DataType> v_g_n_o({BatchCount, N, O});
+    Tensor<InputDataType> v_g_n_o({BatchCount, N, O});
    Tensor<AccDataType> s_g_m_n({BatchCount, M, N});
-    Tensor<DataType> p_g_m_n({BatchCount, M, N});
-    Tensor<DataType> p_drop_g_m_n({BatchCount, M, N});
-    Tensor<DataType> y_g_m_o({BatchCount, M, O});
+    Tensor<InputDataType> p_g_m_n({BatchCount, M, N});
+    Tensor<InputDataType> p_drop_g_m_n({BatchCount, M, N});
+    Tensor<InputDataType> y_g_m_o({BatchCount, M, O});
    Tensor<LSEDataType> lse_g_m({BatchCount, M});

    q_gs_ms_ks.ForEach(
@@ -688,16 +711,16 @@ int run(int argc, char* argv[])
        [&](auto& self, auto idx) { v_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); });

    // qkv gradients have the same descriptor as with qkv
-    DeviceMem q_device_buf(sizeof(DataType) * q_gs_ms_ks.mDesc.GetElementSpaceSize());
-    DeviceMem k_device_buf(sizeof(DataType) * k_gs_ns_ks.mDesc.GetElementSpaceSize());
+    DeviceMem q_device_buf(sizeof(InputDataType) * q_gs_ms_ks.mDesc.GetElementSpaceSize());
+    DeviceMem k_device_buf(sizeof(InputDataType) * k_gs_ns_ks.mDesc.GetElementSpaceSize());
    DeviceMem z_device_buf(sizeof(ZDataType) * z_gs_ms_ns.mDesc.GetElementSpaceSize());
-    DeviceMem v_device_buf(sizeof(DataType) * v_gs_os_ns.mDesc.GetElementSpaceSize());
-    DeviceMem y_device_buf(sizeof(DataType) * y_gs_ms_os.mDesc.GetElementSpaceSize());
+    DeviceMem v_device_buf(sizeof(InputDataType) * v_gs_os_ns.mDesc.GetElementSpaceSize());
+    DeviceMem y_device_buf(sizeof(InputDataType) * y_gs_ms_os.mDesc.GetElementSpaceSize());
    DeviceMem lse_device_buf(sizeof(LSEDataType) * lse_gs_ms.mDesc.GetElementSpaceSize());
-    DeviceMem qgrad_device_buf(sizeof(DataType) * q_gs_ms_ks.mDesc.GetElementSpaceSize());
-    DeviceMem kgrad_device_buf(sizeof(DataType) * k_gs_ns_ks.mDesc.GetElementSpaceSize());
-    DeviceMem vgrad_device_buf(sizeof(DataType) * v_gs_os_ns.mDesc.GetElementSpaceSize());
-    DeviceMem ygrad_device_buf(sizeof(DataType) * y_gs_ms_os.mDesc.GetElementSpaceSize());
+    DeviceMem qgrad_device_buf(sizeof(OutputDataType) * q_gs_ms_ks.mDesc.GetElementSpaceSize());
+    DeviceMem kgrad_device_buf(sizeof(OutputDataType) * k_gs_ns_ks.mDesc.GetElementSpaceSize());
+    DeviceMem vgrad_device_buf(sizeof(OutputDataType) * v_gs_os_ns.mDesc.GetElementSpaceSize());
+    DeviceMem ygrad_device_buf(sizeof(InputDataType) * y_gs_ms_os.mDesc.GetElementSpaceSize());

    q_device_buf.ToDevice(q_gs_ms_ks.mData.data());
    k_device_buf.ToDevice(k_gs_ns_ks.mData.data());
@@ -710,16 +733,16 @@ int run(int argc, char* argv[])
    // get z matrix
    {
        auto argument = gemm.MakeArgument(
-            static_cast<DataType*>(q_device_buf.GetDeviceBuffer()),
-            static_cast<DataType*>(k_device_buf.GetDeviceBuffer()),
+            static_cast<InputDataType*>(q_device_buf.GetDeviceBuffer()),
+            static_cast<InputDataType*>(k_device_buf.GetDeviceBuffer()),
            static_cast<ZDataType*>(z_device_buf.GetDeviceBuffer()),
-            static_cast<DataType*>(v_device_buf.GetDeviceBuffer()),
-            static_cast<DataType*>(y_device_buf.GetDeviceBuffer()),
+            static_cast<InputDataType*>(v_device_buf.GetDeviceBuffer()),
+            static_cast<InputDataType*>(y_device_buf.GetDeviceBuffer()),
            static_cast<LSEDataType*>(lse_device_buf.GetDeviceBuffer()),
-            static_cast<DataType*>(ygrad_device_buf.GetDeviceBuffer()),
-            static_cast<DataType*>(qgrad_device_buf.GetDeviceBuffer()),
-            static_cast<DataType*>(kgrad_device_buf.GetDeviceBuffer()),
-            static_cast<DataType*>(vgrad_device_buf.GetDeviceBuffer()),
+            static_cast<InputDataType*>(ygrad_device_buf.GetDeviceBuffer()),
+            static_cast<OutputDataType*>(qgrad_device_buf.GetDeviceBuffer()),
+            static_cast<OutputDataType*>(kgrad_device_buf.GetDeviceBuffer()),
+            static_cast<OutputDataType*>(vgrad_device_buf.GetDeviceBuffer()),
            {}, // std::array<void*, 1> p_acc0_biases;
            {}, // std::array<void*, 1> p_acc1_biases;
            q_gs_ms_ks_lengths,
@@ -755,16 +778,16 @@ int run(int argc, char* argv[])
    }
    // not need output z matrix
    auto argument = gemm.MakeArgument(
-        static_cast<DataType*>(q_device_buf.GetDeviceBuffer()),
-        static_cast<DataType*>(k_device_buf.GetDeviceBuffer()),
+        static_cast<InputDataType*>(q_device_buf.GetDeviceBuffer()),
+        static_cast<InputDataType*>(k_device_buf.GetDeviceBuffer()),
        static_cast<ZDataType*>(nullptr), // set to nullptr
-        static_cast<DataType*>(v_device_buf.GetDeviceBuffer()),
-        static_cast<DataType*>(y_device_buf.GetDeviceBuffer()),
+        static_cast<InputDataType*>(v_device_buf.GetDeviceBuffer()),
+        static_cast<InputDataType*>(y_device_buf.GetDeviceBuffer()),
        static_cast<LSEDataType*>(lse_device_buf.GetDeviceBuffer()),
-        static_cast<DataType*>(ygrad_device_buf.GetDeviceBuffer()),
-        static_cast<DataType*>(qgrad_device_buf.GetDeviceBuffer()),
-        static_cast<DataType*>(kgrad_device_buf.GetDeviceBuffer()),
-        static_cast<DataType*>(vgrad_device_buf.GetDeviceBuffer()),
+        static_cast<InputDataType*>(ygrad_device_buf.GetDeviceBuffer()),
+        static_cast<OutputDataType*>(qgrad_device_buf.GetDeviceBuffer()),
+        static_cast<OutputDataType*>(kgrad_device_buf.GetDeviceBuffer()),
+        static_cast<OutputDataType*>(vgrad_device_buf.GetDeviceBuffer()),
        {}, // std::array<void*, 1> p_acc0_biases;
        {}, // std::array<void*, 1> p_acc1_biases;
        q_gs_ms_ks_lengths,
@@ -799,9 +822,12 @@ int run(int argc, char* argv[])
    // 3x MNK + 2x MNO
    std::size_t flop = (size_t(3) * M * N * K + size_t(2) * M * N * O) * 2 * BatchCount;
    // Q/K/V/Y, dQ/dK/dV/dY, LSE
-    std::size_t num_btype = (sizeof(DataType) * M * K + sizeof(DataType) * K * N +
-                             sizeof(DataType) * N * O + sizeof(DataType) * M * O) *
-                                size_t(2) * BatchCount +
+    std::size_t num_btype =
+        (sizeof(InputDataType) * M * K + sizeof(InputDataType) * K * N +
+         sizeof(InputDataType) * N * O + sizeof(InputDataType) * M * O * size_t(2) +
+         sizeof(OutputDataType) * M * K + sizeof(OutputDataType) * K * N +
+         sizeof(OutputDataType) * N * O) *
+            BatchCount +
        sizeof(LSEDataType) * M * BatchCount;

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -845,14 +871,14 @@ int run(int argc, char* argv[])
        qgrad_device_buf.SetZero();
        invoker.Run(argument, StreamConfig{nullptr, false});

-        Tensor<DataType> qgrad_g_m_k({BatchCount, M, K});
-        Tensor<DataType> kgrad_g_n_k({BatchCount, N, K});
-        Tensor<DataType> vgrad_g_n_o({BatchCount, N, O});
-        Tensor<DataType> sgrad_g_m_n({BatchCount, M, N});
-        Tensor<DataType> pgrad_g_m_n({BatchCount, M, N});
-        Tensor<DataType> pgrad_drop_g_m_n({BatchCount, M, N});
-        Tensor<DataType> ygrad_g_m_o({BatchCount, M, O});
-        Tensor<DataType> ygrad_dot_y_g_m({BatchCount, M});
+        Tensor<OutputDataType> qgrad_g_m_k({BatchCount, M, K});
+        Tensor<OutputDataType> kgrad_g_n_k({BatchCount, N, K});
+        Tensor<OutputDataType> vgrad_g_n_o({BatchCount, N, O});
+        Tensor<InputDataType> sgrad_g_m_n({BatchCount, M, N});
+        Tensor<InputDataType> pgrad_g_m_n({BatchCount, M, N});
+        Tensor<InputDataType> pgrad_drop_g_m_n({BatchCount, M, N});
+        Tensor<InputDataType> ygrad_g_m_o({BatchCount, M, O});
+        Tensor<InputDataType> ygrad_dot_y_g_m({BatchCount, M});

        ygrad_gs_ms_os.ForEach([&](auto& self, auto idx) {
            ygrad_g_m_o(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx);
@@ -868,13 +894,16 @@ int run(int argc, char* argv[])
 #endif

        // Gradients
-        auto ref_gemm_grad         = ReferenceGemmGradInstance{};
-        auto ref_gemm_grad_invoker = ref_gemm_grad.MakeInvoker();
-        using RefGemmGradArg       = ReferenceGemmGradInstance::Argument;
+        auto ref_gemm0_grad         = ReferenceGemm0GradInstance{};
+        auto ref_gemm0_grad_invoker = ref_gemm0_grad.MakeInvoker();
+        using RefGemm0GradArg       = ReferenceGemm0GradInstance::Argument;
+        auto ref_gemm1_grad         = ReferenceGemm1GradInstance{};
+        auto ref_gemm1_grad_invoker = ref_gemm1_grad.MakeInvoker();
+        using RefGemm1GradArg       = ReferenceGemm1GradInstance::Argument;

        // dP_dropout = dY * V^T
        auto v_g_o_n = v_g_n_o.Transpose({0, 2, 1});
-        ref_gemm_grad_invoker.Run(RefGemmGradArg{
+        ref_gemm0_grad_invoker.Run(RefGemm0GradArg{
            ygrad_g_m_o, v_g_o_n, pgrad_drop_g_m_n, PassThrough{}, PassThrough{}, Scale{1.f}});
 #if PRINT_HOST
        {
@@ -901,7 +930,7 @@ int run(int argc, char* argv[])
                ygrad_dot_y += ck::type_convert<AccDataType>(ygrad_g_m_o(idx_gmo)) *
                               ck::type_convert<AccDataType>(y_g_m_o(idx_gmo));
            }
-            self(idx_gmn) = ck::type_convert<DataType>(
+            self(idx_gmn) = ck::type_convert<InputDataType>(
                ck::type_convert<AccDataType>(p_g_m_n(idx_gmn)) *
                (ck::type_convert<AccDataType>(pgrad_g_m_n(idx_gmn)) - ygrad_dot_y));
        });
@@ -917,7 +946,7 @@ int run(int argc, char* argv[])
 #endif
        // dV = P_drop^T * dY
        auto p_drop_g_n_m = p_drop_g_m_n.Transpose({0, 2, 1});
-        ref_gemm_grad_invoker.Run(RefGemmGradArg{
+        ref_gemm1_grad_invoker.Run(RefGemm1GradArg{
            p_drop_g_n_m, ygrad_g_m_o, vgrad_g_n_o, PassThrough{}, PassThrough{}, Scale{1.0f}});
 #if PRINT_HOST
        {
@@ -929,7 +958,7 @@ int run(int argc, char* argv[])
 #endif

        // dQ = alpha * dS * K
-        ref_gemm_grad_invoker.Run(RefGemmGradArg{
+        ref_gemm1_grad_invoker.Run(RefGemm1GradArg{
            sgrad_g_m_n, k_g_n_k, qgrad_g_m_k, PassThrough{}, PassThrough{}, Scale{alpha}});
 #if PRINT_HOST
        {
@@ -942,7 +971,7 @@ int run(int argc, char* argv[])

        // dK = alpha * dS^T * Q
        auto sgrad_g_n_m = sgrad_g_m_n.Transpose({0, 2, 1});
-        ref_gemm_grad_invoker.Run(RefGemmGradArg{
+        ref_gemm1_grad_invoker.Run(RefGemm1GradArg{
            sgrad_g_n_m, q_g_m_k, kgrad_g_n_k, PassThrough{}, PassThrough{}, Scale{alpha}});
 #if PRINT_HOST
        {
@@ -953,13 +982,13 @@ int run(int argc, char* argv[])
        }
 #endif

-        Tensor<DataType> qgrad_gs_ms_ks_host_result(q_gs_ms_ks_lengths, q_gs_ms_ks_strides);
-        Tensor<DataType> kgrad_gs_ns_ks_host_result(k_gs_ns_ks_lengths, k_gs_ns_ks_strides);
-        Tensor<DataType> vgrad_gs_os_ns_host_result(v_gs_os_ns_lengths, v_gs_os_ns_strides);
+        Tensor<OutputDataType> qgrad_gs_ms_ks_host_result(q_gs_ms_ks_lengths, q_gs_ms_ks_strides);
+        Tensor<OutputDataType> kgrad_gs_ns_ks_host_result(k_gs_ns_ks_lengths, k_gs_ns_ks_strides);
+        Tensor<OutputDataType> vgrad_gs_os_ns_host_result(v_gs_os_ns_lengths, v_gs_os_ns_strides);

-        Tensor<DataType> qgrad_gs_ms_ks_device_result(q_gs_ms_ks_lengths, q_gs_ms_ks_strides);
-        Tensor<DataType> kgrad_gs_ns_ks_device_result(k_gs_ns_ks_lengths, k_gs_ns_ks_strides);
-        Tensor<DataType> vgrad_gs_os_ns_device_result(v_gs_os_ns_lengths, v_gs_os_ns_strides);
+        Tensor<OutputDataType> qgrad_gs_ms_ks_device_result(q_gs_ms_ks_lengths, q_gs_ms_ks_strides);
+        Tensor<OutputDataType> kgrad_gs_ns_ks_device_result(k_gs_ns_ks_lengths, k_gs_ns_ks_strides);
+        Tensor<OutputDataType> vgrad_gs_os_ns_device_result(v_gs_os_ns_lengths, v_gs_os_ns_strides);

        qgrad_device_buf.FromDevice(qgrad_gs_ms_ks_device_result.mData.data());
        kgrad_device_buf.FromDevice(kgrad_gs_ns_ks_device_result.mData.data());

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_multihead_attention_backward_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_multihead_attention_backward_xdl_cshuffle_v3.hpp
@@ -28,7 +28,8 @@ namespace tensor_operation {
 namespace device {

 template <typename GridwiseGemm,
-          typename DataType,
+          typename InputDataType,
+          typename OutputDataType,
          typename ZDataType,
          typename LSEDataType,
          typename AElementwiseOperation,
@@ -46,22 +47,23 @@ template <typename GridwiseGemm,
          typename Block2CTileMap,
          typename ComputeBasePtrOfStridedBatch,
          typename C0MatrixMask,
-          bool HasMainKBlockLoop>
+          bool HasMainKBlockLoop,
+          bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, /*CK_MIN_BLOCK_PER_CU*/ 1)
 #endif
        kernel_batched_multihead_attention_backward_xdl_cshuffle_v1(
-            const DataType* __restrict__ p_a_grid,
-            const DataType* __restrict__ p_b_grid,
+            const InputDataType* __restrict__ p_a_grid,
+            const InputDataType* __restrict__ p_b_grid,
            ZDataType* __restrict__ p_z_grid,
-            const DataType* __restrict__ p_b1_grid,
-            const DataType* __restrict__ p_c_grid,
+            const InputDataType* __restrict__ p_b1_grid,
+            const InputDataType* __restrict__ p_c_grid,
            const LSEDataType* __restrict__ p_lse_grid,
-            const DataType* __restrict__ p_ygrad_grid,
-            DataType* __restrict__ p_qgrad_grid,
-            DataType* __restrict__ p_kgrad_grid,
-            DataType* __restrict__ p_vgrad_grid,
+            const InputDataType* __restrict__ p_ygrad_grid,
+            OutputDataType* __restrict__ p_qgrad_grid,
+            OutputDataType* __restrict__ p_kgrad_grid,
+            OutputDataType* __restrict__ p_vgrad_grid,
            const AElementwiseOperation a_element_op,
            const BElementwiseOperation b_element_op,
            const AccElementwiseOperation acc_element_op,
@@ -78,6 +80,7 @@ __global__ void
            const YGradGridDesc_O0_M_O1 ygrad_grid_desc_o0_m_o1,
            const Block2CTileMap block_2_ctile_map,
            const index_t batch_count,
+            const index_t nblock,
            const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
            const C0MatrixMask c0_matrix_mask,
            const float p_drop,
@@ -109,6 +112,43 @@ __global__ void
    ck::philox ph(seed, global_thread_id, offset);
    ZDataType* z_matrix_ptr = (p_z_grid == nullptr ? nullptr : p_z_grid + z_batch_offset);

+    if constexpr(Deterministic)
+    {
+        for(index_t i = 0; i < nblock; i++)
+        {
+            GridwiseGemm::template Run<HasMainKBlockLoop>(
+                p_a_grid + a_batch_offset,
+                p_b_grid + b_batch_offset,
+                z_matrix_ptr,
+                p_b1_grid + b1_batch_offset,
+                p_c_grid + c_batch_offset,
+                p_lse_grid + lse_batch_offset,
+                p_ygrad_grid + c_batch_offset,
+                p_qgrad_grid + a_batch_offset,
+                p_kgrad_grid + b_batch_offset,
+                p_vgrad_grid + b1_batch_offset,
+                p_shared,
+                a_element_op,
+                b_element_op,
+                acc_element_op,
+                b1_element_op,
+                c_element_op,
+                a_grid_desc_ak0_m_ak1,
+                b_grid_desc_bk0_n_bk1,
+                c_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
+                b1_grid_desc_bk0_n_bk1,
+                c_grid_desc_mblock_mperblock_nblock_nperblock,
+                lse_grid_desc_m,
+                ygrad_grid_desc_o0_m_o1,
+                block_2_ctile_map,
+                c0_matrix_mask,
+                p_drop,
+                ph,
+                i);
+        }
+    }
+    else
+    {
        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
                                                      p_b_grid + b_batch_offset,
                                                      z_matrix_ptr,
@@ -135,7 +175,9 @@ __global__ void
                                                      block_2_ctile_map,
                                                      c0_matrix_mask,
                                                      p_drop,
-                                                  ph);
+                                                      ph,
+                                                      0);
+    }
 #else
    ignore = p_a_grid;
    ignore = p_b_grid;
@@ -168,7 +210,8 @@ template <index_t NumDimG,
          index_t NumDimN,
          index_t NumDimK,
          index_t NumDimO, // NumDimGemm1N
-          typename DataType,
+          typename InputDataType,
+          typename OutputDataType,
          typename GemmDataType,
          typename ZDataType,
          typename LSEDataType,
@@ -228,6 +271,7 @@ template <index_t NumDimG,
          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
          MaskingSpecialization MaskingSpec,
+          bool Deterministic,
          LoopScheduler LoopSched = LoopScheduler::Default>
 struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
    : public BaseOperator // TODO inherit atten bwd op once API stablizes
@@ -594,7 +638,9 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1

    // GridwiseGemm
    using GridwiseGemm = GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1<
-        DataType, // TODO: distinguish A/B datatype
+        InputDataType, // TODO: distinguish A/B datatype
+        OutputDataType,
+        ZDataType,
        GemmDataType,
        GemmAccDataType,
        CShuffleDataType,
@@ -658,22 +704,23 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
        CShuffleBlockTransferScalarPerVector_NPerBlock,
        LoopSched,
        Transform::matrix_padder.PadN,
-        MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle>;
+        MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle,
+        Deterministic>;

    // Argument
    struct Argument : public BaseArgument
    {
        Argument(
-            const DataType* p_a_grid,
-            const DataType* p_b_grid,
+            const InputDataType* p_a_grid,
+            const InputDataType* p_b_grid,
            ZDataType* p_z_grid,
-            const DataType* p_b1_grid,
-            const DataType* p_c_grid, // for dS
+            const InputDataType* p_b1_grid,
+            const InputDataType* p_c_grid, // for dS
            const LSEDataType* p_lse_grid,
-            const DataType* p_ygrad_grid,
-            DataType* p_qgrad_grid,
-            DataType* p_kgrad_grid,
-            DataType* p_vgrad_grid,
+            const InputDataType* p_ygrad_grid,
+            OutputDataType* p_qgrad_grid,
+            OutputDataType* p_kgrad_grid,
+            OutputDataType* p_vgrad_grid,
            const std::array<void*, NumAcc0Bias> p_acc0_biases,
            const std::array<void*, NumAcc1Bias> p_acc1_biases,
            const std::vector<index_t>& a_gs_ms_ks_lengths,
@@ -817,16 +864,16 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
        }

        // pointers
-        const DataType* p_a_grid_;
-        const DataType* p_b_grid_;
+        const InputDataType* p_a_grid_;
+        const InputDataType* p_b_grid_;
        ZDataType* p_z_grid_;
-        const DataType* p_b1_grid_;
-        const DataType* p_c_grid_;
+        const InputDataType* p_b1_grid_;
+        const InputDataType* p_c_grid_;
        const LSEDataType* p_lse_grid_;
-        const DataType* p_ygrad_grid_;
-        DataType* p_qgrad_grid_;
-        DataType* p_kgrad_grid_;
-        DataType* p_vgrad_grid_;
+        const InputDataType* p_ygrad_grid_;
+        OutputDataType* p_qgrad_grid_;
+        OutputDataType* p_kgrad_grid_;
+        OutputDataType* p_vgrad_grid_;

        // tensor descriptor
        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
@@ -891,14 +938,17 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
            }

            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.k_grid_desc_n_k_) * arg.batch_count_;
+                (Deterministic ? 1
+                               : arg.block_2_ctile_map_.CalculateGridSize(arg.k_grid_desc_n_k_)) *
+                arg.batch_count_;

            float ave_time = 0;

            auto launch_kernel = [&](auto has_main_k_block_loop_) {
                const auto kernel = kernel_batched_multihead_attention_backward_xdl_cshuffle_v1<
                    GridwiseGemm,
-                    DataType,
+                    InputDataType,
+                    OutputDataType,
                    ZDataType,
                    LSEDataType,
                    AElementwiseOperation,
@@ -916,9 +966,11 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                    typename GridwiseGemm::DefaultBlock2CTileMap,
                    ComputeBasePtrOfStridedBatch,
                    C0MatrixMask,
-                    has_main_k_block_loop_>;
+                    has_main_k_block_loop_,
+                    Deterministic>;

-                return launch_and_time_kernel(stream_config,
+                return launch_and_time_kernel(
+                    stream_config,
                    kernel,
                    dim3(grid_size),
                    dim3(BlockSize),
@@ -947,6 +999,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                    arg.ygrad_grid_desc_o0_m_o1_,
                    arg.block_2_ctile_map_,
                    arg.batch_count_,
+                    arg.block_2_ctile_map_.CalculateGridSize(arg.k_grid_desc_n_k_),
                    arg.compute_base_ptr_of_batch_,
                    arg.c0_matrix_mask_,
                    arg.p_drop_,
@@ -1061,16 +1114,16 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
    }

    static auto MakeArgument(
-        const DataType* p_a,
-        const DataType* p_b,
+        const InputDataType* p_a,
+        const InputDataType* p_b,
        ZDataType* p_z,
-        const DataType* p_b1,
-        const DataType* p_c,
+        const InputDataType* p_b1,
+        const InputDataType* p_c,
        const LSEDataType* p_lse,
-        const DataType* p_ygrad_grid,
-        DataType* p_qgrad_grid,
-        DataType* p_kgrad_grid,
-        DataType* p_vgrad_grid,
+        const InputDataType* p_ygrad_grid,
+        OutputDataType* p_qgrad_grid,
+        OutputDataType* p_kgrad_grid,
+        OutputDataType* p_vgrad_grid,
        const std::array<void*, NumAcc0Bias> p_acc0_biases,
        const std::array<void*, NumAcc1Bias> p_acc1_biases,
        const std::vector<index_t>& a_gs_ms_ks_lengths,
@@ -1176,16 +1229,16 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
        float p_drop,
        std::tuple<unsigned long long, unsigned long long> seeds) // override
    {
-        return std::make_unique<Argument>(static_cast<const DataType*>(p_a),
-                                          static_cast<const DataType*>(p_b),
+        return std::make_unique<Argument>(static_cast<const InputDataType*>(p_a),
+                                          static_cast<const InputDataType*>(p_b),
                                          static_cast<ZDataType*>(p_z),
-                                          static_cast<const DataType*>(p_b1),
-                                          static_cast<const DataType*>(p_c),
+                                          static_cast<const InputDataType*>(p_b1),
+                                          static_cast<const InputDataType*>(p_c),
                                          static_cast<const LSEDataType*>(p_lse),
-                                          static_cast<const DataType*>(p_ygrad_grid),
-                                          static_cast<DataType*>(p_qgrad_grid),
-                                          static_cast<DataType*>(p_kgrad_grid),
-                                          static_cast<DataType*>(p_vgrad_grid),
+                                          static_cast<const InputDataType*>(p_ygrad_grid),
+                                          static_cast<OutputDataType*>(p_qgrad_grid),
+                                          static_cast<OutputDataType*>(p_kgrad_grid),
+                                          static_cast<OutputDataType*>(p_vgrad_grid),
                                          p_acc0_biases, // cast in struct Argument
                                          p_acc1_biases, // cast in struct Argument
                                          a_gs_ms_ks_lengths,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_pt3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_pt3.hpp
@@ -20,7 +20,9 @@

 namespace ck {

-template <typename DataType,
+template <typename InputDataType,
+          typename OutputDataType,
+          typename ZDataType,
          typename GemmDataType,
          typename FloatGemmAcc,
          typename FloatCShuffle,
@@ -85,6 +87,7 @@ template <typename DataType,
          LoopScheduler LoopSched,
          bool PadN,
          bool MaskOutUpperTriangle,
+          bool Deterministic,
          PipelineVersion PipelineVer = PipelineVersion::v1>
 struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
 {
@@ -439,7 +442,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                                                Sequence<AK0, MPerBlock, AK1>,
                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
                                                ABlockTransferThreadClusterArrangeOrder,
-                                                DataType,
+                                                InputDataType,
                                                GemmDataType,
                                                GridDesc_K0_M_K1,
                                                decltype(q_block_desc_k0_m_k1),
@@ -464,7 +467,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                                                Sequence<BK0, NPerBlock, BK1>,
                                                BBlockTransferThreadClusterLengths_BK0_N_BK1,
                                                BBlockTransferThreadClusterArrangeOrder,
-                                                DataType,
+                                                InputDataType,
                                                GemmDataType,
                                                GridDesc_K0_N_K1,
                                                decltype(k_block_desc_k0_n_k1),
@@ -489,7 +492,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                                                Sequence<BK0, NPerBlock, BK1>,
                                                BBlockTransferThreadClusterLengths_BK0_N_BK1,
                                                BBlockTransferThreadClusterArrangeOrder,
-                                                DataType,
+                                                InputDataType,
                                                GemmDataType,
                                                GridDesc_K0_N_K1,
                                                decltype(v_block_desc_k0_n_k1),
@@ -514,7 +517,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                                                Sequence<AK0, MPerBlock, AK1>,
                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
                                                ABlockTransferThreadClusterArrangeOrder,
-                                                DataType,
+                                                InputDataType,
                                                GemmDataType,
                                                GridDesc_K0_M_K1,
                                                decltype(ygrad_block_desc_k0_m_k1),
@@ -806,7 +809,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                  typename ElementwiseOp = tensor_operation::element_wise::PassThrough>
        using CBlockwiseCopy = ThreadwiseTensorSliceTransfer_v1r3<
            FloatGemmAcc,
-            DataType,
+            OutputDataType,
            decltype(c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4),
            CGridDesc_M0_N0_M1_N1_M2_N2_N3_N4,
            ElementwiseOp,                                                // CElementwiseOperation
@@ -1117,7 +1120,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
    template <index_t BlockSize_, index_t BlockSliceLength_M_, index_t BlockSliceLength_O_>
    struct YDotYGrad_M_O_
    {
-        static constexpr index_t SrcScalarPerVector = 16 / sizeof(DataType);
+        static constexpr index_t SrcScalarPerVector = 16 / sizeof(InputDataType);
        static constexpr auto ThreadClusterLength_O =
            Number<BlockSliceLength_O_ / SrcScalarPerVector>{};
        static constexpr auto ThreadClusterLength_M = Number<BlockSize_ / ThreadClusterLength_O>{};
@@ -1234,16 +1237,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
              typename Block2CTileMap,
              typename C0MatrixMask,
              typename YGradGridDesc_O0_M_O1>
-    __device__ static void Run(const DataType* __restrict__ p_q_grid,
-                               const DataType* __restrict__ p_k_grid,
-                               unsigned short* __restrict__ p_z_grid,
-                               const DataType* __restrict__ p_v_grid,
-                               const DataType* __restrict__ p_y_grid,
+    __device__ static void Run(const InputDataType* __restrict__ p_q_grid,
+                               const InputDataType* __restrict__ p_k_grid,
+                               ZDataType* __restrict__ p_z_grid,
+                               const InputDataType* __restrict__ p_v_grid,
+                               const InputDataType* __restrict__ p_y_grid,
                               const FloatLSE* __restrict__ p_lse_grid,
-                               const DataType* __restrict__ p_ygrad_grid,
-                               DataType* __restrict__ p_qgrad_grid,
-                               DataType* __restrict__ p_kgrad_grid,
-                               DataType* __restrict__ p_vgrad_grid,
+                               const InputDataType* __restrict__ p_ygrad_grid,
+                               OutputDataType* __restrict__ p_qgrad_grid,
+                               OutputDataType* __restrict__ p_kgrad_grid,
+                               OutputDataType* __restrict__ p_vgrad_grid,
                               void* __restrict__ p_shared,
                               const AElementwiseOperation& a_element_op,
                               const BElementwiseOperation& b_element_op,
@@ -1262,7 +1265,8 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                               const Block2CTileMap& block_2_ctile_map,
                               const C0MatrixMask& c0_matrix_mask,
                               const float p_drop,
-                               ck::philox& ph)
+                               ck::philox& ph,
+                               const index_t block_idx_n)
    {
        const FloatGemmAcc p_dropout  = type_convert<FloatGemmAcc>(1.0f - p_drop);
        const FloatGemmAcc rp_dropout = type_convert<FloatGemmAcc>(1.0f / p_dropout);
@@ -1294,9 +1298,11 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
        const auto block_work_idx =
            block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));

+        const index_t block_work_idx_n = Deterministic ? block_idx_n : block_work_idx[I0];
+
        // HACK: this force n_block_data_idx_on_grid into SGPR
        const index_t n_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * NPerBlock);
+            __builtin_amdgcn_readfirstlane(block_work_idx_n * NPerBlock);

        // 6 GEMM operations are categorized into 3 buckets. SizeK == SizeO == head_dim
        // S_MNK / dP_MNO Gemm (Gemm0 rcr)
@@ -1551,7 +1557,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1

        auto z_thread_copy_vgpr_to_global = ThreadwiseTensorSliceTransfer_v1r3<
            ushort,
-            ushort,
+            ZDataType,
            decltype(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5),
            decltype(z_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5),
            tensor_operation::element_wise::PassThrough,
@@ -1571,7 +1577,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
            InMemoryDataOperationEnum::Set,
            1, // DstScalarStrideInVector
            true>{z_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
-                  make_multi_index(block_work_idx[I0], // MBlockId
+                  make_multi_index(block_work_idx_n, // MBlockId
                                   0,                // NBlockId
                                   0,                // mrepeat
                                   0,                // nrepeat
@@ -1695,7 +1701,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1

        // performs for y
        auto y_threadwise_copy = ThreadwiseTensorSliceTransfer_v2<
-            DataType,
+            InputDataType,
            FloatGemmAcc,
            YGridDescriptor_MBlock_MPerBlock_OBlock_OPerBlock,
            decltype(y_thread_desc_m0_m1_o0_o1),
@@ -1758,6 +1764,11 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
        const index_t num_gemm0_m_block_outer_loop = q_grid_desc_k0_m_k1.GetLength(I1) / MPerBlock;
        constexpr index_t num_gemm1_k_block_inner_loop = NPerBlock / Gemm1KPerBlock;

+        if constexpr(Deterministic)
+        {
+            block_sync_lds();
+        }
+
        // Initialize dK&dV
        kgrad_thread_buf.Clear();
        vgrad_thread_buf.Clear();
@@ -2303,7 +2314,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
                FloatCShuffle,        // typename SrcData,
-                DataType,             // typename DstData,
+                OutputDataType,       // typename DstData,
                decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
                decltype(vgrad_grid_desc_nblock_nperblock_oblock_operblock),
                Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
@@ -2314,7 +2325,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
                 make_multi_index(0, 0, 0, 0),
                 vgrad_grid_desc_nblock_nperblock_oblock_operblock,
-                 make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0),
+                 make_multi_index(block_work_idx_n, 0, block_work_idx[I1], 0),
                 c_element_op};

            // shuffle: threadwise copy C from VGPR to LDS
@@ -2361,7 +2372,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
                FloatCShuffle,        // typename SrcData,
-                DataType,             // typename DstData,
+                OutputDataType,       // typename DstData,
                decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
                decltype(kgrad_grid_desc_nblock_nperblock_oblock_operblock),
                Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
@@ -2372,7 +2383,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
                 make_multi_index(0, 0, 0, 0),
                 kgrad_grid_desc_nblock_nperblock_oblock_operblock,
-                 make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0),
+                 make_multi_index(block_work_idx_n, 0, block_work_idx[I1], 0),
                 c_element_op};

            // space filling curve for threadwise C in VGPR