Commit ed26ceeb authored by guangzlu

added profiler code for qloop v1&v2

parent d0cd6886
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#define USING_MASK 0
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <cmath> // std::sqrt, std::floor
#include <vector>
#include <string>
#include <tuple>
#include <fstream>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_mha_bwd_qloop_v1.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using U16 = unsigned short;
using INT32 = int32_t;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale = ck::tensor_operation::element_wise::Scale;
using QKVElementOp = PassThrough;
using YElementOp = PassThrough;
using InputDataType = F16;
using OutputDataType = F16;
using GemmDataType = F16;
using AccDataType = F32;
using ShuffleDataType = F32;
using LSEDataType = F32;
using ZDataType = U16; // INT32
using Acc0BiasDataType = ck::Tuple<>;
using Acc1BiasDataType = ck::Tuple<>;
static constexpr ck::index_t NumDimG = 2;
static constexpr ck::index_t NumDimM = 1;
static constexpr ck::index_t NumDimN = 1;
static constexpr ck::index_t NumDimK = 1;
static constexpr ck::index_t NumDimO = 1;
static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock = 8;
#if USING_MASK
static constexpr auto MaskingSpec =
ck::tensor_operation::device::MaskingSpecialization::MaskUpperTriangleFromTopLeft;
#else
static constexpr auto MaskingSpec =
ck::tensor_operation::device::MaskingSpecialization::MaskDisabled;
#endif
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
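// hipMalloc's status is intentionally discarded in this sample;
// production code should check the returned hipError_t.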
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
int init_method = 1;
ck::index_t M = 512;
ck::index_t N = 512;
ck::index_t K = 64;
ck::index_t O = 64;
ck::index_t G0 = 4;
ck::index_t G1 = 6;
bool input_permute = false;
bool output_permute = false;
float p_drop = 0.0;
const unsigned long long seed = 1;
const unsigned long long offset = 0;
float p_dropout = 1 - p_drop;
ZDataType p_dropout_in_16bits = ZDataType(std::floor(p_dropout * 65535.0));
float rp_dropout = 1.0 / p_dropout;
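// Dropout bookkeeping (computed for reference; unused below since p_drop = 0):
// p_dropout is the keep probability, p_dropout_in_16bits maps it onto the
// 16-bit RNG range used for the Z mask, and rp_dropout is the 1/keep-prob
// rescale applied to surviving elements.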
float alpha = 1.f / std::sqrt(K);
std::cout << "init_method: " << init_method << std::endl;
std::cout << "M: " << M << std::endl;
std::cout << "N: " << N << std::endl;
std::cout << "K: " << K << std::endl;
std::cout << "O: " << O << std::endl;
std::cout << "G0: " << G0 << std::endl;
std::cout << "G1: " << G1 << std::endl;
std::cout << "alpha: " << alpha << std::endl;
std::cout << "input_permute: " << input_permute << std::endl;
std::cout << "output_permute: " << output_permute << std::endl;
std::cout << "p_drop: " << p_drop << std::endl;
std::cout << "seed: " << seed << std::endl;
std::cout << "offset: " << offset << std::endl;
const ck::index_t BatchCount = G0 * G1;
std::vector<ck::index_t> q_gs_ms_ks_lengths{G0, G1, M, K};
std::vector<ck::index_t> q_gs_ms_ks_strides =
input_permute
? std::vector<ck::index_t>{M * G1 * K, K, G1 * K, 1} // Q layout [G0, M, G1, K]
: std::vector<ck::index_t>{G1 * M * K, M * K, K, 1}; // Q layout [G0, G1, M, K]
std::vector<ck::index_t> k_gs_ns_ks_lengths{G0, G1, N, K};
std::vector<ck::index_t> k_gs_ns_ks_strides =
input_permute
? std::vector<ck::index_t>{N * G1 * K, K, G1 * K, 1} // K layout [G0, N, G1, K]
: std::vector<ck::index_t>{G1 * N * K, N * K, K, 1}; // K layout [G0, G1, N, K]
std::vector<ck::index_t> v_gs_os_ns_lengths{G0, G1, O, N};
std::vector<ck::index_t> v_gs_os_ns_strides =
input_permute
? std::vector<ck::index_t>{N * G1 * O, O, 1, G1 * O} // V layout [G0, N, G1, O]
: std::vector<ck::index_t>{G1 * N * O, N * O, 1, O}; // V layout [G0, G1, N, O]
std::vector<ck::index_t> y_gs_ms_os_lengths{G0, G1, M, O};
std::vector<ck::index_t> y_gs_ms_os_strides =
output_permute
? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // Y layout [G0, M, G1, O]
: std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // Y layout [G0, G1, M, O]
std::vector<ck::index_t> z_gs_ms_ns_lengths{G0, G1, M, N};
std::vector<ck::index_t> z_gs_ms_ns_strides =
input_permute
? std::vector<ck::index_t>{M * G1 * N, N, G1 * N, 1} // Z layout [G0, M, G1, N]
: std::vector<ck::index_t>{G1 * M * N, M * N, N, 1}; // Z layout [G0, G1, M, N]
// The softmax statistic log-sum-exp (LSE) is used to speed up softmax recomputation
// in the backward pass:
// Pi = exp(Si) / (exp(S0) + exp(S1) + ...)
//    = exp(Si) / exp(log(exp(S0) + exp(S1) + ...))
//    = exp(Si - log(exp(S0) + exp(S1) + ...))
//               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
//               LSE
std::vector<ck::index_t> lse_gs_ms_lengths{G0, G1, M};
std::vector<ck::index_t> lse_gs_ms_strides{G1 * M, M, 1}; // LSE layout [G0, G1, M]
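// Illustrative sketch only (hypothetical helper, not used by this profiler):
// a numerically stable LSE over one row of scores, matching the identity above.
// Subtracting the row max before exponentiating avoids overflow.
// auto log_sum_exp = [](const std::vector<float>& s) {
//     float m = *std::max_element(s.begin(), s.end()); // needs <algorithm>
//     float sum = 0.f;
//     for(float si : s) sum += std::exp(si - m);
//     return m + std::log(sum);
// };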
SimpleDeviceMem q_device_buf(sizeof(InputDataType) * G0 * G1 * M * K);
SimpleDeviceMem k_device_buf(sizeof(InputDataType) * G0 * G1 * N * K);
SimpleDeviceMem z_device_buf(sizeof(ZDataType) * G0 * G1 * M * N);
SimpleDeviceMem v_device_buf(sizeof(InputDataType) * G0 * G1 * O * N);
SimpleDeviceMem y_device_buf(sizeof(InputDataType) * G0 * G1 * M * O);
SimpleDeviceMem lse_device_buf(sizeof(LSEDataType) * G0 * G1 * M);
SimpleDeviceMem qgrad_device_buf(sizeof(OutputDataType) * G0 * G1 * M * K);
SimpleDeviceMem kgrad_device_buf(sizeof(OutputDataType) * G0 * G1 * N * K);
SimpleDeviceMem vgrad_device_buf(sizeof(OutputDataType) * G0 * G1 * O * N);
SimpleDeviceMem ygrad_device_buf(sizeof(InputDataType) * G0 * G1 * M * O);
using DeviceOp =
ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackwardQloopV1<2,
1,
1,
1,
1,
InputDataType,
OutputDataType,
ZDataType,
LSEDataType,
ck::Tuple<>,
ck::Tuple<>,
QKVElementOp,
QKVElementOp,
Scale,
QKVElementOp,
YElementOp,
MaskingSpec>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device op instances
std::cout << "Run all instances and do timing" << std::endl;
for(std::size_t i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
q_device_buf.GetDeviceBuffer(),
k_device_buf.GetDeviceBuffer(),
nullptr, // set to nullptr
v_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
lse_device_buf.GetDeviceBuffer(),
ygrad_device_buf.GetDeviceBuffer(),
qgrad_device_buf.GetDeviceBuffer(),
kgrad_device_buf.GetDeviceBuffer(),
vgrad_device_buf.GetDeviceBuffer(),
{}, // std::array<void*, 1> p_acc0_biases;
{}, // std::array<void*, 1> p_acc1_biases;
q_gs_ms_ks_lengths,
q_gs_ms_ks_strides,
k_gs_ns_ks_lengths,
k_gs_ns_ks_strides,
z_gs_ms_ns_lengths,
z_gs_ms_ns_strides,
v_gs_os_ns_lengths,
v_gs_os_ns_strides,
y_gs_ms_os_lengths,
y_gs_ms_os_strides,
lse_gs_ms_lengths,
{}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_lengths},
{}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_strides},
{}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_lengths},
{}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_strides},
QKVElementOp{},
QKVElementOp{},
Scale{alpha},
QKVElementOp{},
YElementOp{},
p_drop,
std::tuple<unsigned long long, unsigned long long>(seed, offset));
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
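// Work estimate: backward MHA performs three M x N x K GEMMs (S = Q*K^T
// recompute, dQ = dS*K, dK = dS^T*Q) and two M x N x O GEMMs (dV = P^T*dY,
// dP = dY*V^T), at 2 flops per multiply-accumulate; elementwise softmax and
// dropout work is not counted.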
std::size_t flop = (size_t(3) * M * N * K + size_t(2) * M * N * O) * 2 * BatchCount;
std::size_t num_btype =
(sizeof(InputDataType) * M * K + sizeof(InputDataType) * K * N +
sizeof(InputDataType) * N * O + sizeof(InputDataType) * M * O * size_t(2) +
sizeof(OutputDataType) * M * K + sizeof(OutputDataType) * K * N +
sizeof(OutputDataType) * N * O) *
BatchCount +
sizeof(LSEDataType) * M * BatchCount;
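// ave_time is in ms, so flop / 1e9 / ms yields TFLOP/s and bytes / 1e6 / ms yields GB/s.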
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = static_cast<int>(i);
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
// run the best instance
{
if(best_op_id < 0)
{
std::cout << "no instance supports this problem" << std::endl;
return 0;
}
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(
q_device_buf.GetDeviceBuffer(),
k_device_buf.GetDeviceBuffer(),
nullptr, // set to nullptr
v_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
lse_device_buf.GetDeviceBuffer(),
ygrad_device_buf.GetDeviceBuffer(),
qgrad_device_buf.GetDeviceBuffer(),
kgrad_device_buf.GetDeviceBuffer(),
vgrad_device_buf.GetDeviceBuffer(),
{}, // std::array<void*, 1> p_acc0_biases;
{}, // std::array<void*, 1> p_acc1_biases;
q_gs_ms_ks_lengths,
q_gs_ms_ks_strides,
k_gs_ns_ks_lengths,
k_gs_ns_ks_strides,
z_gs_ms_ns_lengths,
z_gs_ms_ns_strides,
v_gs_os_ns_lengths,
v_gs_os_ns_strides,
y_gs_ms_os_lengths,
y_gs_ms_os_strides,
lse_gs_ms_lengths,
{}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_lengths},
{}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_strides},
{}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_lengths},
{}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_strides},
QKVElementOp{},
QKVElementOp{},
Scale{alpha},
QKVElementOp{},
YElementOp{},
p_drop,
std::tuple<unsigned long long, unsigned long long>(seed, offset));
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#define USING_MASK 0
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <cmath> // std::sqrt, std::floor
#include <vector>
#include <string>
#include <tuple>
#include <fstream>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_mha_bwd_qloop_v2.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using U16 = unsigned short;
using INT32 = int32_t;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale = ck::tensor_operation::element_wise::Scale;
using QKVElementOp = PassThrough;
using YElementOp = PassThrough;
using InputDataType = F16;
using OutputDataType = F16;
using GemmDataType = F16;
using AccDataType = F32;
using ShuffleDataType = F32;
using LSEDataType = F32;
using ZDataType = U16; // INT32
using Acc0BiasDataType = ck::Tuple<>;
using Acc1BiasDataType = ck::Tuple<>;
static constexpr ck::index_t NumDimG = 2;
static constexpr ck::index_t NumDimM = 1;
static constexpr ck::index_t NumDimN = 1;
static constexpr ck::index_t NumDimK = 1;
static constexpr ck::index_t NumDimO = 1;
static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock = 8;
#if USING_MASK
static constexpr auto MaskingSpec =
ck::tensor_operation::device::MaskingSpecialization::MaskUpperTriangleFromTopLeft;
#else
static constexpr auto MaskingSpec =
ck::tensor_operation::device::MaskingSpecialization::MaskDisabled;
#endif
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
int init_method = 1;
ck::index_t M = 512;
ck::index_t N = 512;
ck::index_t K = 128;
ck::index_t O = 128;
ck::index_t G0 = 4;
ck::index_t G1 = 6;
bool input_permute = false;
bool output_permute = false;
float p_drop = 0.0;
const unsigned long long seed = 1;
const unsigned long long offset = 0;
float p_dropout = 1 - p_drop;
ZDataType p_dropout_in_16bits = ZDataType(std::floor(p_dropout * 65535.0));
float rp_dropout = 1.0 / p_dropout;
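// As in the v1 profiler above, p_dropout_in_16bits and rp_dropout are dropout
// bookkeeping values computed for reference only; with p_drop = 0 dropout is
// effectively disabled.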
float alpha = 1.f / std::sqrt(K);
std::cout << "init_method: " << init_method << std::endl;
std::cout << "M: " << M << std::endl;
std::cout << "N: " << N << std::endl;
std::cout << "K: " << K << std::endl;
std::cout << "O: " << O << std::endl;
std::cout << "G0: " << G0 << std::endl;
std::cout << "G1: " << G1 << std::endl;
std::cout << "alpha: " << alpha << std::endl;
std::cout << "input_permute: " << input_permute << std::endl;
std::cout << "output_permute: " << output_permute << std::endl;
std::cout << "p_drop: " << p_drop << std::endl;
std::cout << "seed: " << seed << std::endl;
std::cout << "offset: " << offset << std::endl;
const ck::index_t BatchCount = G0 * G1;
std::vector<ck::index_t> q_gs_ms_ks_lengths{G0, G1, M, K};
std::vector<ck::index_t> q_gs_ms_ks_strides =
input_permute
? std::vector<ck::index_t>{M * G1 * K, K, G1 * K, 1} // Q layout [G0, M, G1, K]
: std::vector<ck::index_t>{G1 * M * K, M * K, K, 1}; // Q layout [G0, G1, M, K]
std::vector<ck::index_t> k_gs_ns_ks_lengths{G0, G1, N, K};
std::vector<ck::index_t> k_gs_ns_ks_strides =
input_permute
? std::vector<ck::index_t>{N * G1 * K, K, G1 * K, 1} // K layout [G0, N, G1, K]
: std::vector<ck::index_t>{G1 * N * K, N * K, K, 1}; // K layout [G0, G1, N, K]
std::vector<ck::index_t> v_gs_os_ns_lengths{G0, G1, O, N};
std::vector<ck::index_t> v_gs_os_ns_strides =
input_permute
? std::vector<ck::index_t>{N * G1 * O, O, 1, G1 * O} // V layout [G0, N, G1, O]
: std::vector<ck::index_t>{G1 * N * O, N * O, 1, O}; // V layout [G0, G1, N, O]
std::vector<ck::index_t> y_gs_ms_os_lengths{G0, G1, M, O};
std::vector<ck::index_t> y_gs_ms_os_strides =
output_permute
? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // Y layout [G0, M, G1, O]
: std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // Y layout [G0, G1, M, O]
std::vector<ck::index_t> z_gs_ms_ns_lengths{G0, G1, M, N};
std::vector<ck::index_t> z_gs_ms_ns_strides =
input_permute
? std::vector<ck::index_t>{M * G1 * N, N, G1 * N, 1} // Z layout [G0, M, G1, N]
: std::vector<ck::index_t>{G1 * M * N, M * N, N, 1}; // Z layout [G0, G1, M, N]
// The softmax statistic log-sum-exp (LSE) is used to speed up softmax recomputation
// in the backward pass:
// Pi = exp(Si) / (exp(S0) + exp(S1) + ...)
//    = exp(Si) / exp(log(exp(S0) + exp(S1) + ...))
//    = exp(Si - log(exp(S0) + exp(S1) + ...))
//               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
//               LSE
std::vector<ck::index_t> lse_gs_ms_lengths{G0, G1, M};
std::vector<ck::index_t> lse_gs_ms_strides{G1 * M, M, 1}; // LSE layout [G0, G1, M]
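// Worked example of the identity above: for a score row S = {0, ln 2},
// LSE = log(exp(0) + exp(ln 2)) = log(3) ~= 1.0986, so
// P = {exp(0 - LSE), exp(ln 2 - LSE)} = {1/3, 2/3}.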
SimpleDeviceMem q_device_buf(sizeof(InputDataType) * G0 * G1 * M * K);
SimpleDeviceMem k_device_buf(sizeof(InputDataType) * G0 * G1 * N * K);
SimpleDeviceMem z_device_buf(sizeof(ZDataType) * G0 * G1 * M * N);
SimpleDeviceMem v_device_buf(sizeof(InputDataType) * G0 * G1 * O * N);
SimpleDeviceMem y_device_buf(sizeof(InputDataType) * G0 * G1 * M * O);
SimpleDeviceMem lse_device_buf(sizeof(LSEDataType) * G0 * G1 * M);
SimpleDeviceMem qgrad_device_buf(sizeof(OutputDataType) * G0 * G1 * M * K);
SimpleDeviceMem kgrad_device_buf(sizeof(OutputDataType) * G0 * G1 * N * K);
SimpleDeviceMem vgrad_device_buf(sizeof(OutputDataType) * G0 * G1 * O * N);
SimpleDeviceMem ygrad_device_buf(sizeof(InputDataType) * G0 * G1 * M * O);
using DeviceOp =
ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackwardQloopV2<2,
1,
1,
1,
1,
InputDataType,
OutputDataType,
ZDataType,
LSEDataType,
ck::Tuple<>,
ck::Tuple<>,
QKVElementOp,
QKVElementOp,
Scale,
QKVElementOp,
YElementOp,
MaskingSpec>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device op instances
std::cout << "Run all instances and do timing" << std::endl;
for(std::size_t i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
q_device_buf.GetDeviceBuffer(),
k_device_buf.GetDeviceBuffer(),
nullptr, // set to nullptr
v_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
lse_device_buf.GetDeviceBuffer(),
ygrad_device_buf.GetDeviceBuffer(),
qgrad_device_buf.GetDeviceBuffer(),
kgrad_device_buf.GetDeviceBuffer(),
vgrad_device_buf.GetDeviceBuffer(),
{}, // std::array<void*, 1> p_acc0_biases;
{}, // std::array<void*, 1> p_acc1_biases;
q_gs_ms_ks_lengths,
q_gs_ms_ks_strides,
k_gs_ns_ks_lengths,
k_gs_ns_ks_strides,
z_gs_ms_ns_lengths,
z_gs_ms_ns_strides,
v_gs_os_ns_lengths,
v_gs_os_ns_strides,
y_gs_ms_os_lengths,
y_gs_ms_os_strides,
lse_gs_ms_lengths,
{}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_lengths},
{}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_strides},
{}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_lengths},
{}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_strides},
QKVElementOp{},
QKVElementOp{},
Scale{alpha},
QKVElementOp{},
YElementOp{},
p_drop,
std::tuple<unsigned long long, unsigned long long>(seed, offset));
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
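// Same accounting as the v1 profiler above: three M x N x K plus two M x N x O
// GEMMs at 2 flops per multiply-accumulate; ave_time is in ms, so the divisions
// below yield TFLOP/s and GB/s.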
std::size_t flop = (size_t(3) * M * N * K + size_t(2) * M * N * O) * 2 * BatchCount;
std::size_t num_btype =
(sizeof(InputDataType) * M * K + sizeof(InputDataType) * K * N +
sizeof(InputDataType) * N * O + sizeof(InputDataType) * M * O * size_t(2) +
sizeof(OutputDataType) * M * K + sizeof(OutputDataType) * K * N +
sizeof(OutputDataType) * N * O) *
BatchCount +
sizeof(LSEDataType) * M * BatchCount;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = static_cast<int>(i);
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
// run the best instance
{
if(best_op_id < 0)
{
std::cout << "no instance supports this problem" << std::endl;
return 0;
}
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(
q_device_buf.GetDeviceBuffer(),
k_device_buf.GetDeviceBuffer(),
nullptr, // set to nullptr
v_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
lse_device_buf.GetDeviceBuffer(),
ygrad_device_buf.GetDeviceBuffer(),
qgrad_device_buf.GetDeviceBuffer(),
kgrad_device_buf.GetDeviceBuffer(),
vgrad_device_buf.GetDeviceBuffer(),
{}, // std::array<void*, 1> p_acc0_biases;
{}, // std::array<void*, 1> p_acc1_biases;
q_gs_ms_ks_lengths,
q_gs_ms_ks_strides,
k_gs_ns_ks_lengths,
k_gs_ns_ks_strides,
z_gs_ms_ns_lengths,
z_gs_ms_ns_strides,
v_gs_os_ns_lengths,
v_gs_os_ns_strides,
y_gs_ms_os_lengths,
y_gs_ms_os_strides,
lse_gs_ms_lengths,
{}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_lengths},
{}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_strides},
{}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_lengths},
{}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_strides},
QKVElementOp{},
QKVElementOp{},
Scale{alpha},
QKVElementOp{},
YElementOp{},
p_drop,
std::tuple<unsigned long long, unsigned long long>(seed, offset));
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
...
@@ -17,8 +17,8 @@ namespace tensor_operation {
namespace device {
namespace instance {
-void add_device_batched_mha_bwd_qloop_casual_f16_f16_gmk_gnk_gno_gmo_instances(
-std::vector<std::unique_ptr<DeviceBatchedMultiheadAttentionBackward<
+void add_device_batched_mha_bwd_qloop_v1_casual_f16_f16_instances(
+std::vector<std::unique_ptr<DeviceBatchedMultiheadAttentionBackwardQloopV1<
2,
1,
1,
...
@@ -37,9 +37,9 @@ void add_device_batched_mha_bwd_qloop_casual_f16_f16_gmk_gnk_gno_gmo_instances(
PassThrough,
MaskingSpecialization::MaskUpperTriangleFromTopLeft>>>& instances);
-void add_device_batched_mha_bwd_qloop_noncasual_f16_f16_gmk_gnk_gno_gmo_instances(
+void add_device_batched_mha_bwd_qloop_v1_noncasual_f16_f16_instances(
std::vector<std::unique_ptr<
-DeviceBatchedMultiheadAttentionBackward<2,
+DeviceBatchedMultiheadAttentionBackwardQloopV1<2,
1,
1,
1,
...
@@ -55,10 +55,11 @@ void add_device_batched_mha_bwd_qloop_noncasual_f16_f16_gmk_gnk_gno_gmo_instance
Scale,
PassThrough,
PassThrough,
-MaskingSpecialization::MaskDisabled>>>& instances);
+MaskingSpecialization::MaskDisabled>>>&
+instances);
-void add_device_batched_mha_bwd_qloop_casual_bf16_bf16_gmk_gnk_gno_gmo_instances(
-std::vector<std::unique_ptr<DeviceBatchedMultiheadAttentionBackward<
+void add_device_batched_mha_bwd_qloop_v1_casual_bf16_bf16_instances(
+std::vector<std::unique_ptr<DeviceBatchedMultiheadAttentionBackwardQloopV1<
2,
1,
1,
...
@@ -77,9 +78,9 @@ void add_device_batched_mha_bwd_qloop_casual_bf16_bf16_gmk_gnk_gno_gmo_instances
PassThrough,
MaskingSpecialization::MaskUpperTriangleFromTopLeft>>>& instances);
-void add_device_batched_mha_bwd_qloop_noncasual_bf16_bf16_gmk_gnk_gno_gmo_instances(
+void add_device_batched_mha_bwd_qloop_v1_noncasual_bf16_bf16_instances(
std::vector<std::unique_ptr<
-DeviceBatchedMultiheadAttentionBackward<2,
+DeviceBatchedMultiheadAttentionBackwardQloopV1<2,
1,
1,
1,
...
@@ -95,7 +96,8 @@ void add_device_batched_mha_bwd_qloop_noncasual_bf16_bf16_gmk_gnk_gno_gmo_instan
Scale,
PassThrough,
PassThrough,
-MaskingSpecialization::MaskDisabled>>>& instances);
+MaskingSpecialization::MaskDisabled>>>&
+instances);
template <typename InputDataType,
typename OutputDataType,
...
@@ -103,7 +105,7 @@ template <typename InputDataType,
typename LSEDataType,
MaskingSpecialization MaskingSpec>
struct DeviceOperationInstanceFactory<
-ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward<2,
+ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackwardQloopV1<2,
1,
1,
1,
...
@@ -121,7 +123,7 @@ struct DeviceOperationInstanceFactory<
PassThrough,
MaskingSpec>>
{
-using DeviceOp = DeviceBatchedMultiheadAttentionBackward<2,
+using DeviceOp = DeviceBatchedMultiheadAttentionBackwardQloopV1<2,
1,
1,
1,
...
@@ -148,12 +150,11 @@ struct DeviceOperationInstanceFactory<
{
if constexpr(MaskingSpec == MaskingSpecialization::MaskUpperTriangleFromTopLeft)
{
-add_device_batched_mha_bwd_qloop_casual_f16_f16_gmk_gnk_gno_gmo_instances(op_ptrs);
+add_device_batched_mha_bwd_qloop_v1_casual_f16_f16_instances(op_ptrs);
}
else if(MaskingSpec == MaskingSpecialization::MaskDisabled)
{
-add_device_batched_mha_bwd_qloop_noncasual_f16_f16_gmk_gnk_gno_gmo_instances(
-op_ptrs);
+add_device_batched_mha_bwd_qloop_v1_noncasual_f16_f16_instances(op_ptrs);
}
}
else if constexpr(is_same_v<InputDataType, BF16> && is_same_v<OutputDataType, BF16> &&
...
@@ -161,13 +162,11 @@ struct DeviceOperationInstanceFactory<
{
if constexpr(MaskingSpec == MaskingSpecialization::MaskUpperTriangleFromTopLeft)
{
-add_device_batched_mha_bwd_qloop_casual_bf16_bf16_gmk_gnk_gno_gmo_instances(
-op_ptrs);
+add_device_batched_mha_bwd_qloop_v1_casual_bf16_bf16_instances(op_ptrs);
}
else if(MaskingSpec == MaskingSpecialization::MaskDisabled)
{
-add_device_batched_mha_bwd_qloop_noncasual_bf16_bf16_gmk_gnk_gno_gmo_instances(
-op_ptrs);
+add_device_batched_mha_bwd_qloop_v1_noncasual_bf16_bf16_instances(op_ptrs);
}
}
return op_ptrs;
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_batched_mha_bwd_qloop_v2_casual_f16_f16_instances(
std::vector<std::unique_ptr<DeviceBatchedMultiheadAttentionBackwardQloopV2<
2,
1,
1,
1,
1,
F16,
F16,
unsigned short,
F32,
ck::Tuple<>,
ck::Tuple<>,
PassThrough,
PassThrough,
Scale,
PassThrough,
PassThrough,
MaskingSpecialization::MaskUpperTriangleFromTopLeft>>>& instances);
void add_device_batched_mha_bwd_qloop_v2_noncasual_f16_f16_instances(
std::vector<std::unique_ptr<
DeviceBatchedMultiheadAttentionBackwardQloopV2<2,
1,
1,
1,
1,
F16,
F16,
unsigned short,
F32,
ck::Tuple<>,
ck::Tuple<>,
PassThrough,
PassThrough,
Scale,
PassThrough,
PassThrough,
MaskingSpecialization::MaskDisabled>>>&
instances);
void add_device_batched_mha_bwd_qloop_v2_casual_bf16_bf16_instances(
std::vector<std::unique_ptr<DeviceBatchedMultiheadAttentionBackwardQloopV2<
2,
1,
1,
1,
1,
BF16,
BF16,
unsigned short,
F32,
ck::Tuple<>,
ck::Tuple<>,
PassThrough,
PassThrough,
Scale,
PassThrough,
PassThrough,
MaskingSpecialization::MaskUpperTriangleFromTopLeft>>>& instances);
void add_device_batched_mha_bwd_qloop_v2_noncasual_bf16_bf16_instances(
std::vector<std::unique_ptr<
DeviceBatchedMultiheadAttentionBackwardQloopV2<2,
1,
1,
1,
1,
BF16,
BF16,
unsigned short,
F32,
ck::Tuple<>,
ck::Tuple<>,
PassThrough,
PassThrough,
Scale,
PassThrough,
PassThrough,
MaskingSpecialization::MaskDisabled>>>&
instances);
template <typename InputDataType,
typename OutputDataType,
typename ZDataType,
typename LSEDataType,
MaskingSpecialization MaskingSpec>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackwardQloopV2<2,
1,
1,
1,
1,
InputDataType,
OutputDataType,
ZDataType,
LSEDataType,
ck::Tuple<>,
ck::Tuple<>,
PassThrough,
PassThrough,
Scale,
PassThrough,
PassThrough,
MaskingSpec>>
{
using DeviceOp = DeviceBatchedMultiheadAttentionBackwardQloopV2<2,
1,
1,
1,
1,
InputDataType,
OutputDataType,
ZDataType,
LSEDataType,
ck::Tuple<>,
ck::Tuple<>,
PassThrough,
PassThrough,
Scale,
PassThrough,
PassThrough,
MaskingSpec>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<InputDataType, half_t> && is_same_v<OutputDataType, half_t> &&
is_same_v<ZDataType, unsigned short> && is_same_v<LSEDataType, float>)
{
if constexpr(MaskingSpec == MaskingSpecialization::MaskUpperTriangleFromTopLeft)
{
add_device_batched_mha_bwd_qloop_v2_casual_f16_f16_instances(op_ptrs);
}
else if(MaskingSpec == MaskingSpecialization::MaskDisabled)
{
add_device_batched_mha_bwd_qloop_v2_noncasual_f16_f16_instances(op_ptrs);
}
}
else if constexpr(is_same_v<InputDataType, BF16> && is_same_v<OutputDataType, BF16> &&
is_same_v<ZDataType, unsigned short> && is_same_v<LSEDataType, float>)
{
if constexpr(MaskingSpec == MaskingSpecialization::MaskUpperTriangleFromTopLeft)
{
add_device_batched_mha_bwd_qloop_v2_casual_bf16_bf16_instances(op_ptrs);
}
else if(MaskingSpec == MaskingSpecialization::MaskDisabled)
{
add_device_batched_mha_bwd_qloop_v2_noncasual_bf16_bf16_instances(op_ptrs);
}
}
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
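// Usage sketch (mirrors the profilers above; all types shown are ones this
// header already instantiates): fetch the F16, mask-disabled op set via the
// factory specialization defined in this file.
//
// using Op = ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackwardQloopV2<
//     2, 1, 1, 1, 1, ck::half_t, ck::half_t, unsigned short, float,
//     ck::Tuple<>, ck::Tuple<>, PassThrough, PassThrough, Scale, PassThrough,
//     PassThrough, ck::tensor_operation::device::MaskingSpecialization::MaskDisabled>;
// auto op_ptrs = ck::tensor_operation::device::instance::
//     DeviceOperationInstanceFactory<Op>::GetInstances();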
...
@@ -3,8 +3,10 @@ add_instance_library(device_batched_gemm_softmax_gemm_permute_instance
device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
-device_batched_mha_bwd_qloop_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
-device_batched_mha_bwd_qloop_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+device_batched_mha_bwd_qloop_v1_bf16_bf16_instance.cpp
+device_batched_mha_bwd_qloop_v1_f16_f16_instance.cpp
+device_batched_mha_bwd_qloop_v2_bf16_bf16_instance.cpp
+device_batched_mha_bwd_qloop_v2_f16_f16_instance.cpp
device_batched_mha_bwd_qloop_light_v1_bf16_bf16_instance.cpp
device_batched_mha_bwd_qloop_light_v1_f16_f16_instance.cpp
device_batched_mha_bwd_qloop_light_v2_bf16_bf16_instance.cpp
...
...
@@ -51,21 +51,22 @@ template <index_t NumDimG,
index_t NumDimK,
index_t NumDimO,
MaskingSpecialization MaskingSpec>
-using device_batched_mha_bwd_qloop_bf16_bf16_gmk_gnk_gno_gmo_instances = std::tuple<
+using device_batched_mha_bwd_qloop_v1_bf16_bf16_instances = std::tuple<
// clang-format off
// ########################################################################################| NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| InputDataType| OutputDataType| GemmDataType| ZDataType| LSEDataType| Acc0BiasDataType| Acc1BiasDataType| GemmAcc| CShuffle| A| B| Acc| B1| C| GEMM| ATensorSpec| B0TensorSpec| B1TensorSpec| CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| Gemm2| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CShuffleBlockTransferScalarPerVector_NPerBlock| MaskingSpec| Deterministic|
// ########################################################################################| | | | | | | | | | | | | DataType| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| | | | | Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| | | |
// ########################################################################################| | | | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | | | | | Stage| | Block| Block| Block| Block| Block| | | | | | Per| Per| Per| Per| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| | | |
// ########################################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | |
ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, unsigned short, F32, Acc0BiasDataType, Acc1BiasDataType, F32, F32, QKVElementOp, QKVElementOp, Scale, QKVElementOp, YElementOp, GemmSpec, TensorSpecQ, TensorSpecK, TensorSpecV, TensorSpecY, 1, 256, 64, 128, 64, 64, 32, 8, 8, 2, 32, 32, 2, 1, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 2, S<1, 32, 1, 8>, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, Deterministic>
// ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, unsigned short, F32, Acc0BiasDataType, Acc1BiasDataType, F32, F32, QKVElementOp, QKVElementOp, Scale, QKVElementOp, YElementOp, GemmSpec, TensorSpecQ, TensorSpecK, TensorSpecV, TensorSpecY, 1, 256, 128, 128, 64, 64, 32, 8, 8, 2, 32, 32, 4, 1, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 2, S<1, 32, 1, 8>, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, Deterministic>,
// ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, unsigned short, F32, Acc0BiasDataType, Acc1BiasDataType, F32, F32, QKVElementOp, QKVElementOp, Scale, QKVElementOp, YElementOp, GemmSpec, TensorSpecQ, TensorSpecK, TensorSpecV, TensorSpecY, 1, 256, 128, 128, 32, 64, 32, 8, 8, 2, 32, 32, 4, 1, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 2, S<1, 32, 1, 8>, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, Deterministic>,
// ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, unsigned short, F32, Acc0BiasDataType, Acc1BiasDataType, F32, F32, QKVElementOp, QKVElementOp, Scale, QKVElementOp, YElementOp, GemmSpec, TensorSpecQ, TensorSpecK, TensorSpecV, TensorSpecY, 1, 256, 128, 128, 64, 64, 32, 8, 8, 2, 32, 32, 4, 1, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 2, S<1, 32, 1, 8>, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, Deterministic>
// clang-format on
>;
-void add_device_batched_mha_bwd_qloop_casual_bf16_bf16_gmk_gnk_gno_gmo_instances(
-std::vector<std::unique_ptr<DeviceBatchedMultiheadAttentionBackward<
+void add_device_batched_mha_bwd_qloop_v1_casual_bf16_bf16_instances(
+std::vector<std::unique_ptr<DeviceBatchedMultiheadAttentionBackwardQloopV1<
2,
1,
1,
...
@@ -85,7 +86,7 @@ void add_device_batched_mha_bwd_qloop_casual_bf16_bf16_gmk_gnk_gno_gmo_instances
MaskingSpecialization::MaskUpperTriangleFromTopLeft>>>& instances)
{
add_device_operation_instances(instances,
-device_batched_mha_bwd_qloop_bf16_bf16_gmk_gnk_gno_gmo_instances<
+device_batched_mha_bwd_qloop_v1_bf16_bf16_instances<
2,
1,
1,
...
@@ -94,9 +95,9 @@ void add_device_batched_mha_bwd_qloop_casual_bf16_bf16_gmk_gnk_gno_gmo_instances
MaskingSpecialization::MaskUpperTriangleFromTopLeft>{});
}
-void add_device_batched_mha_bwd_qloop_noncasual_bf16_bf16_gmk_gnk_gno_gmo_instances(
+void add_device_batched_mha_bwd_qloop_v1_noncasual_bf16_bf16_instances(
std::vector<std::unique_ptr<
-DeviceBatchedMultiheadAttentionBackward<2,
+DeviceBatchedMultiheadAttentionBackwardQloopV1<2,
1,
1,
1,
...
@@ -112,11 +113,12 @@ void add_device_batched_mha_bwd_qloop_noncasual_bf16_bf16_gmk_gnk_gno_gmo_instan
Scale,
PassThrough,
PassThrough,
-MaskingSpecialization::MaskDisabled>>>& instances)
+MaskingSpecialization::MaskDisabled>>>&
+instances)
{
-add_device_operation_instances(instances,
-device_batched_mha_bwd_qloop_bf16_bf16_gmk_gnk_gno_gmo_instances<
-2,
+add_device_operation_instances(
+instances,
+device_batched_mha_bwd_qloop_v1_bf16_bf16_instances<2,
1,
1,
1,
...
...
@@ -51,7 +51,7 @@ template <index_t NumDimG,
index_t NumDimK,
index_t NumDimO,
MaskingSpecialization MaskingSpec>
-using device_batched_mha_bwd_qloop_f16_f16_gmk_gnk_gno_gmo_instances = std::tuple<
+using device_batched_mha_bwd_qloop_v1_f16_f16_instances = std::tuple<
// clang-format off
// ########################################################################################| NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| InputDataType| OutputDataType| GemmDataType| ZDataType| LSEDataType| Acc0BiasDataType| Acc1BiasDataType| GemmAcc| CShuffle| A| B| Acc| B1| C| GEMM| ATensorSpec| B0TensorSpec| B1TensorSpec| CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| Gemm2| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CShuffleBlockTransferScalarPerVector_NPerBlock| MaskingSpec| Deterministic|
// ########################################################################################| | | | | | | | | | | | | DataType| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| | | | | Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| | | |
...
@@ -64,8 +64,8 @@ using device_batched_mha_bwd_qloop_f16_f16_gmk_gnk_gno_gmo_instances = std::tupl
// clang-format on
>;
-void add_device_batched_mha_bwd_qloop_casual_f16_f16_gmk_gnk_gno_gmo_instances(
-std::vector<std::unique_ptr<DeviceBatchedMultiheadAttentionBackward<
+void add_device_batched_mha_bwd_qloop_v1_casual_f16_f16_instances(
+std::vector<std::unique_ptr<DeviceBatchedMultiheadAttentionBackwardQloopV1<
2,
1,
1,
...
@@ -85,7 +85,7 @@ void add_device_batched_mha_bwd_qloop_casual_f16_f16_gmk_gnk_gno_gmo_instances(
MaskingSpecialization::MaskUpperTriangleFromTopLeft>>>& instances)
{
add_device_operation_instances(instances,
-device_batched_mha_bwd_qloop_f16_f16_gmk_gnk_gno_gmo_instances<
+device_batched_mha_bwd_qloop_v1_f16_f16_instances<
2,
1,
1,
...
@@ -94,9 +94,9 @@ void add_device_batched_mha_bwd_qloop_casual_f16_f16_gmk_gnk_gno_gmo_instances(
MaskingSpecialization::MaskUpperTriangleFromTopLeft>{});
}
-void add_device_batched_mha_bwd_qloop_noncasual_f16_f16_gmk_gnk_gno_gmo_instances(
+void add_device_batched_mha_bwd_qloop_v1_noncasual_f16_f16_instances(
std::vector<std::unique_ptr<
-DeviceBatchedMultiheadAttentionBackward<2,
+DeviceBatchedMultiheadAttentionBackwardQloopV1<2,
1,
1,
1,
...
@@ -112,11 +112,12 @@ void add_device_batched_mha_bwd_qloop_noncasual_f16_f16_gmk_gnk_gno_gmo_instance
Scale,
PassThrough,
PassThrough,
-MaskingSpecialization::MaskDisabled>>>& instances)
+MaskingSpecialization::MaskDisabled>>>&
+instances)
{
-add_device_operation_instances(instances,
-device_batched_mha_bwd_qloop_f16_f16_gmk_gnk_gno_gmo_instances<
-2,
+add_device_operation_instances(
+instances,
+device_batched_mha_bwd_qloop_v1_f16_f16_instances<2,
1,
1,
1,
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using U16 = unsigned short;
using INT32 = int32_t;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale = ck::tensor_operation::element_wise::Scale;
using QKVElementOp = PassThrough;
using YElementOp = PassThrough;
using InputDataType = BF16;
using OutputDataType = BF16;
using GemmDataType = BF16;
using AccDataType = F32;
using ShuffleDataType = F32;
using LSEDataType = F32;
using ZDataType = U16; // INT32
using Acc0BiasDataType = ck::Tuple<>;
using Acc1BiasDataType = ck::Tuple<>;
// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
// static constexpr auto TensorDefault =
// ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecQ = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecK = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecV = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecY = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr bool Deterministic = false;
static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock = 8;
template <index_t NumDimG,
index_t NumDimM,
index_t NumDimN,
index_t NumDimK,
index_t NumDimO,
MaskingSpecialization MaskingSpec>
using device_batched_mha_bwd_qloop_v2_bf16_bf16_instances = std::tuple<
// clang-format off
// ########################################################################################| NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| InputDataType| OutputDataType| GemmDataType| ZDataType| LSEDataType| Acc0BiasDataType| Acc1BiasDataType| GemmAcc| CShuffle| A| B| Acc| B1| C| GEMM| ATensorSpec| B0TensorSpec| B1TensorSpec| CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| Gemm2| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CShuffleBlockTransferScalarPerVector_NPerBlock| MaskingSpec| Deterministic|
// ########################################################################################| | | | | | | | | | | | | DataType| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| | | | | Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| | | |
// ########################################################################################| | | | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | | | | | Stage| | Block| Block| Block| Block| Block| | | | | | Per| Per| Per| Per| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| | | |
// ########################################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | |
ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, InputDataType, OutputDataType, GemmDataType, ZDataType, LSEDataType, Acc0BiasDataType, Acc1BiasDataType, AccDataType, ShuffleDataType, QKVElementOp, QKVElementOp, Scale, QKVElementOp, YElementOp, GemmSpec, TensorSpecQ, TensorSpecK, TensorSpecV, TensorSpecY, 1, 256, 64, 128, 128, 128, 32, 8, 8, 2, 32, 32, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 4, S<1, 32, 1, 8>, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, Deterministic>
// ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, unsigned short, F32, Acc0BiasDataType, Acc1BiasDataType, F32, F32, QKVElementOp, QKVElementOp, Scale, QKVElementOp, YElementOp, GemmSpec, TensorSpecQ, TensorSpecK, TensorSpecV, TensorSpecY, 1, 256, 128, 128, 64, 64, 32, 8, 8, 2, 32, 32, 4, 1, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 2, S<1, 32, 1, 8>, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, Deterministic>,
// ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, unsigned short, F32, Acc0BiasDataType, Acc1BiasDataType, F32, F32, QKVElementOp, QKVElementOp, Scale, QKVElementOp, YElementOp, GemmSpec, TensorSpecQ, TensorSpecK, TensorSpecV, TensorSpecY, 1, 256, 128, 128, 32, 64, 32, 8, 8, 2, 32, 32, 4, 1, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 2, S<1, 32, 1, 8>, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, Deterministic>,
// ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, unsigned short, F32, Acc0BiasDataType, Acc1BiasDataType, F32, F32, QKVElementOp, QKVElementOp, Scale, QKVElementOp, YElementOp, GemmSpec, TensorSpecQ, TensorSpecK, TensorSpecV, TensorSpecY, 1, 256, 128, 128, 64, 64, 32, 8, 8, 2, 32, 32, 4, 1, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 2, S<1, 32, 1, 8>, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, Deterministic>
// clang-format on
>;
void add_device_batched_mha_bwd_qloop_v2_casual_bf16_bf16_instances(
std::vector<std::unique_ptr<DeviceBatchedMultiheadAttentionBackwardQloopV2<
2,
1,
1,
1,
1,
BF16,
BF16,
unsigned short,
F32,
ck::Tuple<>,
ck::Tuple<>,
PassThrough,
PassThrough,
Scale,
PassThrough,
PassThrough,
MaskingSpecialization::MaskUpperTriangleFromTopLeft>>>& instances)
{
add_device_operation_instances(instances,
device_batched_mha_bwd_qloop_v2_bf16_bf16_instances<
2,
1,
1,
1,
1,
MaskingSpecialization::MaskUpperTriangleFromTopLeft>{});
}
void add_device_batched_mha_bwd_qloop_v2_noncasual_bf16_bf16_instances(
std::vector<std::unique_ptr<
DeviceBatchedMultiheadAttentionBackwardQloopV2<2,
1,
1,
1,
1,
BF16,
BF16,
unsigned short,
F32,
ck::Tuple<>,
ck::Tuple<>,
PassThrough,
PassThrough,
Scale,
PassThrough,
PassThrough,
MaskingSpecialization::MaskDisabled>>>&
instances)
{
add_device_operation_instances(
instances,
device_batched_mha_bwd_qloop_v2_bf16_bf16_instances<2,
1,
1,
1,
1,
MaskingSpecialization::MaskDisabled>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using U16 = unsigned short;
using INT32 = int32_t;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale = ck::tensor_operation::element_wise::Scale;
using QKVElementOp = PassThrough;
using YElementOp = PassThrough;
using InputDataType = F16;
using OutputDataType = F16;
using GemmDataType = F16;
using AccDataType = F32;
using ShuffleDataType = F32;
using LSEDataType = F32;
using ZDataType = U16; // INT32
using Acc0BiasDataType = ck::Tuple<>;
using Acc1BiasDataType = ck::Tuple<>;
// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
// static constexpr auto TensorDefault =
// ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecQ = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecK = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecV = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecY = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr bool Deterministic = false;
static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock = 8;
template <index_t NumDimG,
index_t NumDimM,
index_t NumDimN,
index_t NumDimK,
index_t NumDimO,
MaskingSpecialization MaskingSpec>
using device_batched_mha_bwd_qloop_v2_f16_f16_instances = std::tuple<
// clang-format off
// ########################################################################################| NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| InputDataType| OutputDataType| GemmDataType| ZDataType| LSEDataType| Acc0BiasDataType| Acc1BiasDataType| GemmAcc| CShuffle| A| B| Acc| B1| C| GEMM| ATensorSpec| B0TensorSpec| B1TensorSpec| CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| Gemm2| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CShuffleBlockTransferScalarPerVector_NPerBlock| MaskingSpec| Deterministic|
// ########################################################################################| | | | | | | | | | | | | DataType| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| | | | | Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| | | |
// ########################################################################################| | | | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | | | | | Stage| | Block| Block| Block| Block| Block| | | | | | Per| Per| Per| Per| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| | | |
// ########################################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | |
ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, InputDataType, OutputDataType, GemmDataType, ZDataType, LSEDataType, Acc0BiasDataType, Acc1BiasDataType, AccDataType, ShuffleDataType, QKVElementOp, QKVElementOp, Scale, QKVElementOp, YElementOp, GemmSpec, TensorSpecQ, TensorSpecK, TensorSpecV, TensorSpecY, 1, 256, 64, 128, 128, 128, 32, 8, 8, 2, 32, 32, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 4, S<1, 32, 1, 8>, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, Deterministic>
// ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, unsigned short, F32, Acc0BiasDataType, Acc1BiasDataType, F32, F32, QKVElementOp, QKVElementOp, Scale, QKVElementOp, YElementOp, GemmSpec, TensorSpecQ, TensorSpecK, TensorSpecV, TensorSpecY, 1, 256, 128, 128, 64, 64, 32, 8, 8, 2, 32, 32, 4, 1, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 2, S<1, 32, 1, 8>, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, Deterministic>,
// ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, unsigned short, F32, Acc0BiasDataType, Acc1BiasDataType, F32, F32, QKVElementOp, QKVElementOp, Scale, QKVElementOp, YElementOp, GemmSpec, TensorSpecQ, TensorSpecK, TensorSpecV, TensorSpecY, 1, 256, 128, 128, 32, 64, 32, 8, 8, 2, 32, 32, 4, 1, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 2, S<1, 32, 1, 8>, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, Deterministic>,
// ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, unsigned short, F32, Acc0BiasDataType, Acc1BiasDataType, F32, F32, QKVElementOp, QKVElementOp, Scale, QKVElementOp, YElementOp, GemmSpec, TensorSpecQ, TensorSpecK, TensorSpecV, TensorSpecY, 1, 256, 128, 128, 64, 64, 32, 8, 8, 2, 32, 32, 4, 1, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 2, S<1, 32, 1, 8>, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, Deterministic>
// clang-format on
>;
void add_device_batched_mha_bwd_qloop_v2_casual_f16_f16_instances(
std::vector<std::unique_ptr<DeviceBatchedMultiheadAttentionBackwardQloopV2<
2,
1,
1,
1,
1,
F16,
F16,
unsigned short,
F32,
ck::Tuple<>,
ck::Tuple<>,
PassThrough,
PassThrough,
Scale,
PassThrough,
PassThrough,
MaskingSpecialization::MaskUpperTriangleFromTopLeft>>>& instances)
{
add_device_operation_instances(instances,
device_batched_mha_bwd_qloop_v2_f16_f16_instances<
2,
1,
1,
1,
1,
MaskingSpecialization::MaskUpperTriangleFromTopLeft>{});
}
void add_device_batched_mha_bwd_qloop_v2_noncasual_f16_f16_instances(
std::vector<std::unique_ptr<
DeviceBatchedMultiheadAttentionBackwardQloopV2<2,
1,
1,
1,
1,
F16,
F16,
unsigned short,
F32,
ck::Tuple<>,
ck::Tuple<>,
PassThrough,
PassThrough,
Scale,
PassThrough,
PassThrough,
MaskingSpecialization::MaskDisabled>>>&
instances)
{
add_device_operation_instances(
instances,
device_batched_mha_bwd_qloop_v2_f16_f16_instances<2,
1,
1,
1,
1,
MaskingSpecialization::MaskDisabled>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck