Commit 78a300ff authored by Alan Turner

Update tuning method

parent dea0555f
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
namespace ck {
namespace tensor_operation {
namespace device {
enum struct ConvolutionBackwardDataSpecialization
{
Default,
Filter1x1Stride1Pad0,
};
inline std::string
getConvBackwardDataSpecializationString(const ConvolutionBackwardDataSpecialization& s)
{
switch(s)
{
case ConvolutionBackwardDataSpecialization::Default: return "Default";
case ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0:
return "FFilter1x1Stride1Pad0";
default: return "Unrecognized specialization!";
}
}
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
namespace ck {
namespace tensor_operation {
namespace device {
enum struct ConvolutionBackwardWeightSpecialization
{
Default,
Filter1x1Stride1Pad0,
Filter1x1Pad0,
OddC,
};
inline std::string
getConvBackwardWeightSpecializationString(const ConvolutionBackwardWeightSpecialization& s)
{
switch(s)
{
case ConvolutionBackwardWeightSpecialization::Default: return "Default";
case ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0:
return "Filter1x1Stride1Pad0";
case ConvolutionBackwardWeightSpecialization::Filter1x1Pad0: return "Filter1x1Pad0";
case ConvolutionBackwardWeightSpecialization::OddC: return "OddC";
default: return "Unrecognized specialization!";
}
}
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
namespace ck {
namespace tensor_operation {
namespace device {
enum struct ConvolutionForwardSpecialization
{
Default,
Filter1x1Pad0,
Filter1x1Stride1Pad0,
OddC,
};
inline std::string getConvForwardSpecializationString(const ConvolutionForwardSpecialization& s)
{
switch(s)
{
case ConvolutionForwardSpecialization::Default: return "Default";
case ConvolutionForwardSpecialization::Filter1x1Pad0: return "Filter1x1Pad0";
case ConvolutionForwardSpecialization::Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0";
case ConvolutionForwardSpecialization::OddC: return "OddC";
default: return "Unrecognized specialization!";
}
}
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cassert>
#include <cmath>
#include <string>
#include "ck/stream_config.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
struct BaseArgument
{
BaseArgument() = default;
BaseArgument(const BaseArgument&) = default;
BaseArgument& operator=(const BaseArgument&) = default;
virtual ~BaseArgument() {}
void* p_workspace_ = nullptr;
};
struct BaseInvoker
{
BaseInvoker() = default;
BaseInvoker(const BaseInvoker&) = default;
BaseInvoker& operator=(const BaseInvoker&) = default;
virtual float Run(const BaseArgument*, const StreamConfig& = StreamConfig{})
{
return float{0};
}
virtual ~BaseInvoker() {}
};
struct BaseOperator
{
BaseOperator() = default;
BaseOperator(const BaseOperator&) = default;
BaseOperator& operator=(const BaseOperator&) = default;
virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
virtual std::string GetTypeString() const { return ""; }
virtual size_t GetWorkSpaceSize(const BaseArgument*) const { return 0; }
virtual void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace) const
{
assert(p_arg);
p_arg->p_workspace_ = p_workspace;
}
virtual ~BaseOperator() {}
};
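// Typical host-side flow (illustrative, not mandated by this header): a concrete operator's
// MakeArgumentPointer() builds a BaseArgument, IsSupportedArgument() validates it, and
// MakeInvokerPointer()->Run(p_arg) launches and times the kernel.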
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <array>
#include <iostream>
#include <memory>
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// Tensor Contraction:
// input : A
// input : B
// input : D0, D1, ...
// output : E
// C = a_op(A) * b_op(B)
// E = cde_op(C, D0, D1, ...)
// Assume:
// A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...]
// B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...]
// D[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...]
// E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...]
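// Example (illustrative): with NumDimG = NumDimM = NumDimN = NumDimK = 2,
//   A[G0, G1, M0, M1, K0, K1] and B[G0, G1, N0, N1, K0, K1] are contracted into
//   E[G0, G1, M0, M1, N0, N1], i.e. a batched GEMM with G = G0*G1, M = M0*M1, N = N0*N1, K = K0*K1.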
template <index_t NumDimG,
index_t NumDimM,
index_t NumDimN,
index_t NumDimK,
typename ADataType,
typename BDataType,
typename DsDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation>
struct DeviceBatchedContractionMultipleD : public BaseOperator
{
static constexpr index_t NumDTensor = DsDataType::Size();
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
std::array<const void*, NumDTensor> p_ds,
void* p_e,
const std::vector<index_t>& a_gs_ms_ns_lengths,
const std::vector<index_t>& a_gs_ms_ks_strides,
const std::vector<index_t>& b_gs_ns_ks_lengths,
const std::vector<index_t>& b_gs_ns_ks_strides,
const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_lengths,
const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_strides,
const std::vector<index_t>& e_gs_ms_ns_lengths,
const std::vector<index_t>& e_gs_ms_ns_strides,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
template <typename GridwiseGemm,
typename FloatAB,
typename FloatDsPointer,
typename FloatE,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
typename AGridDesc_AK0_M_AK1,
typename BGridDesc_BK0_N_BK1,
typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename ComputePtrOffsetOfBatch,
typename Block2ETileMap,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_contraction_multiple_d_xdl_cshuffle(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatDsPointer p_ds_grid,
FloatE* __restrict__ p_e_grid,
const index_t batch_count,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
e_grid_desc_mblock_mperblock_nblock_nperblock,
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
const Block2ETileMap block_2_etile_map)
{
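// Grid/batch mapping (illustrative): the Invoker below launches
// CalculateGridSize(e_grid_desc_m_n) * batch_count blocks, so each batch owns
// get_grid_size() / batch_count consecutive blocks and g_idx selects the batch handled by this block.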
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
FloatDsPointer p_ds_grid_grp;
static constexpr index_t NumDTensor =
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
static_for<0, NumDTensor, 1>{}(
[&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset,
p_ds_grid_grp,
p_e_grid + e_batch_offset,
p_shared,
a_element_op,
b_element_op,
cde_element_op,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
ds_grid_desc_mblock_mperblock_nblock_nperblock,
e_grid_desc_mblock_mperblock_nblock_nperblock,
block_2_etile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_ds_grid;
ignore = p_e_grid;
ignore = batch_count;
ignore = a_element_op;
ignore = b_element_op;
ignore = cde_element_op;
ignore = a_grid_desc_ak0_m_ak1;
ignore = b_grid_desc_bk0_n_bk1;
ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
ignore = e_grid_desc_mblock_mperblock_nblock_nperblock;
ignore = block_2_etile_map;
ignore = compute_ptr_offset_of_batch;
#endif
}
} // namespace ck
namespace ck {
namespace tensor_operation {
namespace device {
// Tensor Contraction:
// input : A
// input : B
// input : D0, D1, ...
// output : E
// C = a_op(A) * b_op(B)
// E = cde_op(C, D0, D1, ...)
// Assume:
// A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...]
// B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...]
// D[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...]
// E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...]
// FIXME: The TensorSpecialization::Packed specialization does not cover all packed tensor cases; it
// merely degenerates into TensorSpecialization::Default with NumDimG/M/N/K = 1
//
// Detail: a packed tensor satisfies
// stride_0 = 1
// stride_i = stride_{i - 1} * extent_{i - 1}
// So tensor
// [G0, G1, G2, M, N]
// transposed into tensor
// [G0, G2, G1, M, N]
// with strides
// [G2 * G1 * M * N, G1 * M * N, M * N, N, 1]
// is again a packed tensor. However, MakeGridDescriptor() currently just merges dimensions and ignores
// some of the input tensor's strides, so finer-grained dimension information is lost. Merging dimensions
// is essentially a degenerate case of TensorSpecialization::Default with NumDimG/M/N/K = 1.
//
// Might need to expose dimension order to the interface to fully support
// TensorSpecialization::Packed.
template <index_t NumDimG,
index_t NumDimM,
index_t NumDimN,
index_t NumDimK,
typename ADataType,
typename BDataType,
typename AccDataType,
typename CShuffleDataType,
typename DsDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
GemmSpecialization GemmSpec,
TensorSpecialization ASpec,
TensorSpecialization BSpec,
TensorSpecialization DESpec,
index_t NumGemmKPrefetchStage,
index_t BlockSize,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t AK1,
index_t BK1,
index_t MPerXDL,
index_t NPerXDL,
index_t MXdlPerWave,
index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsExtraN,
index_t CShuffleMXdlPerWavePerShuffle,
index_t CShuffleNXdlPerWavePerShuffle,
typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CDEBlockTransferScalarPerVector_NPerBlock,
LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
: public DeviceBatchedContractionMultipleD<NumDimG,
NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
DsDataType,
EDataType,
AElementwiseOperation,
BElementwiseOperation,
CDEElementwiseOperation>
{
using DeviceOp = DeviceBatchedContractionMultipleD_Xdl_CShuffle;
static constexpr index_t NumDTensor = DsDataType::Size();
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto matrix_padder =
MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
// Assume: A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...]
static auto MakeAGridDescriptor_M_K(const std::vector<index_t>& a_gs_ms_ks_lengths_vec,
const std::vector<index_t>& a_gs_ms_ks_strides_vec)
{
assert(a_gs_ms_ks_lengths_vec.size() == NumDimG + NumDimM + NumDimK &&
a_gs_ms_ks_strides_vec.size() == NumDimG + NumDimM + NumDimK);
const auto to_tuple = [&](auto& vec, auto start, auto end) {
return generate_tuple([&](auto i) { return vec[start + i]; }, Number<end - start>{});
};
const auto a_ms_ks_lengths = to_tuple(
a_gs_ms_ks_lengths_vec, Number<NumDimG>{}, Number<NumDimG + NumDimM + NumDimK>{});
const auto a_ms_ks_strides = to_tuple(
a_gs_ms_ks_strides_vec, Number<NumDimG>{}, Number<NumDimG + NumDimM + NumDimK>{});
// dimension Ids for M0, M1, ...
constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{};
// dimension Ids for K0, K1, ...
constexpr auto kDimIds =
typename arithmetic_sequence_gen<NumDimM, NumDimM + NumDimK, 1>::type{};
// lengths for M0, M1, ...
const auto mLengths = get_container_subset(a_ms_ks_lengths, mDimIds);
// lengths for K0, K1, ...
const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds);
if constexpr(ASpec == TensorSpecialization::Packed)
{
auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{});
auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{});
const auto a_grid_desc_mraw_kraw = make_naive_tensor_descriptor(
make_tuple(M, K),
make_tuple(a_ms_ks_strides[Number<NumDimM - 1>{}],
a_ms_ks_strides[Number<NumDimM + NumDimK - 1>{}]));
return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
}
else
{
// naive tensor A[M0, M1, M2, ..., K0, K1, K2...]
const auto a_grid_desc_ms_ks =
make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides);
// transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...]
const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor(
a_grid_desc_ms_ks,
make_tuple(make_merge_transform(mLengths), make_merge_transform(kLengths)),
make_tuple(mDimIds, kDimIds),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
}
}
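// Example (illustrative, hypothetical sizes): with NumDimG = 1, NumDimM = 2, NumDimK = 1 and a packed
// A of shape [G0, M0, M1, K0] = [4, 8, 16, 32],
//   MakeAGridDescriptor_M_K({4, 8, 16, 32}, {4096, 512, 32, 1})
// yields an [M, K] = [128, 32] descriptor with strides [32, 1] (before any padding by matrix_padder).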
// Assume: B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...]
static auto MakeBGridDescriptor_N_K(const std::vector<index_t>& b_gs_ns_ks_lengths_vec,
const std::vector<index_t>& b_gs_ns_ks_strides_vec)
{
assert(b_gs_ns_ks_lengths_vec.size() == NumDimG + NumDimN + NumDimK &&
b_gs_ns_ks_strides_vec.size() == NumDimG + NumDimN + NumDimK);
const auto to_tuple = [&](auto& vec, auto start, auto end) {
return generate_tuple([&](auto i) { return vec[start + i]; }, Number<end - start>{});
};
const auto b_ns_ks_lengths = to_tuple(
b_gs_ns_ks_lengths_vec, Number<NumDimG>{}, Number<NumDimG + NumDimN + NumDimK>{});
const auto b_ns_ks_strides = to_tuple(
b_gs_ns_ks_strides_vec, Number<NumDimG>{}, Number<NumDimG + NumDimN + NumDimK>{});
// dimension Ids for N0, N1, ...
constexpr auto nDimIds = typename arithmetic_sequence_gen<0, NumDimN, 1>::type{};
// dimension Ids for K0, K1, ...
constexpr auto kDimIds =
typename arithmetic_sequence_gen<NumDimN, NumDimN + NumDimK, 1>::type{};
// lengths for K0, K1, ...
const auto kLengths = get_container_subset(b_ns_ks_lengths, kDimIds);
// lengths for N0, N1, ...
const auto nLengths = get_container_subset(b_ns_ks_lengths, nDimIds);
if constexpr(BSpec == TensorSpecialization::Packed)
{
auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{});
auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{});
const auto b_grid_desc_nraw_kraw = make_naive_tensor_descriptor(
make_tuple(N, K),
make_tuple(b_ns_ks_strides[Number<NumDimN - 1>{}],
b_ns_ks_strides[Number<NumDimN + NumDimK - 1>{}]));
return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
}
else
{
// naive tensor B[N0, N1, N2, ..., K0, K1, K2, ...]
const auto b_grid_desc_ns_ks =
make_naive_tensor_descriptor(b_ns_ks_lengths, b_ns_ks_strides);
// transformed tensor B[NRaw = N0 * N1 * N2 * ..., KRaw = K0 * K1 * K2 * ...]
const auto b_grid_desc_nraw_kraw = transform_tensor_descriptor(
b_grid_desc_ns_ks,
make_tuple(make_merge_transform(nLengths), make_merge_transform(kLengths)),
make_tuple(nDimIds, kDimIds),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
}
}
// assume E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...]
static auto MakeEGridDescriptor_M_N(const std::vector<index_t>& e_gs_ms_ns_lengths_vec,
const std::vector<index_t>& e_gs_ms_ns_strides_vec)
{
assert(e_gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN &&
e_gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN);
const auto to_tuple = [&](auto& vec, auto start, auto end) {
return generate_tuple([&](auto i) { return vec[start + i]; }, Number<end - start>{});
};
const auto e_ms_ns_lengths = to_tuple(
e_gs_ms_ns_lengths_vec, Number<NumDimG>{}, Number<NumDimG + NumDimM + NumDimN>{});
const auto e_ms_ns_strides = to_tuple(
e_gs_ms_ns_strides_vec, Number<NumDimG>{}, Number<NumDimG + NumDimM + NumDimN>{});
// dimension Ids for M0, M1, ...
constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{};
// dimension Ids for N0, N1, ...
constexpr auto nDimIds =
typename arithmetic_sequence_gen<NumDimM, NumDimM + NumDimN, 1>::type{};
// lengths for M0, M1, ...
const auto mLengths = get_container_subset(e_ms_ns_lengths, mDimIds);
// lengths for N0, N1, ...
const auto nLengths = get_container_subset(e_ms_ns_lengths, nDimIds);
if constexpr(DESpec == TensorSpecialization::Packed)
{
auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{});
auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{});
const auto e_grid_desc_mraw_nraw = make_naive_tensor_descriptor(
make_tuple(M, N),
make_tuple(e_ms_ns_strides[Number<NumDimM - 1>{}],
e_ms_ns_strides[Number<NumDimM + NumDimN - 1>{}]));
return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
}
else
{
// naive tensor E[M0, M1, M2, ..., N0, N1, N2...]
const auto e_grid_desc_ms_ns =
make_naive_tensor_descriptor(e_ms_ns_lengths, e_ms_ns_strides);
// transformed tensor E[MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * N2 * ...]
const auto e_grid_desc_mraw_nraw = transform_tensor_descriptor(
e_grid_desc_ms_ns,
make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)),
make_tuple(mDimIds, nDimIds),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
}
}
// assume E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...]
static auto MakeEGridDescriptor_G_M_N(const std::vector<index_t>& e_gs_ms_ns_lengths_vec,
const std::vector<index_t>& e_gs_ms_ns_strides_vec)
{
assert(e_gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN &&
e_gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN);
const auto to_tuple = [&](auto& vec, auto start, auto end) {
return generate_tuple([&](auto i) { return vec[start + i]; }, Number<end - start>{});
};
const auto e_gs_ms_ns_lengths =
to_tuple(e_gs_ms_ns_lengths_vec, Number<0>{}, Number<NumDimG + NumDimM + NumDimN>{});
const auto e_gs_ms_ns_strides =
to_tuple(e_gs_ms_ns_strides_vec, Number<0>{}, Number<NumDimG + NumDimM + NumDimN>{});
// dimension Ids for G0, G1, ...
constexpr auto gDimIds = typename arithmetic_sequence_gen<0, NumDimG, 1>::type{};
// dimension Ids for M0, M1, ...
constexpr auto mDimIds =
typename arithmetic_sequence_gen<NumDimG, NumDimG + NumDimM, 1>::type{};
// dimension Ids for N0, N1, ...
constexpr auto nDimIds = typename arithmetic_sequence_gen<NumDimG + NumDimM,
NumDimG + NumDimM + NumDimN,
1>::type{};
// lengths for G0, G1, ...
const auto gLengths = get_container_subset(e_gs_ms_ns_lengths, gDimIds);
// lengths for M0, M1, ...
const auto mLengths = get_container_subset(e_gs_ms_ns_lengths, mDimIds);
// lengths for N0, N1, ...
const auto nLengths = get_container_subset(e_gs_ms_ns_lengths, nDimIds);
if constexpr(DESpec == TensorSpecialization::Packed)
{
auto G = container_reduce(gLengths, math::multiplies{}, Number<1>{});
auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{});
auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{});
const auto e_grid_desc_g_mraw_nraw = make_naive_tensor_descriptor(
make_tuple(G, M, N),
make_tuple(e_gs_ms_ns_strides[Number<NumDimG - 1>{}],
e_gs_ms_ns_strides[Number<NumDimG + NumDimM - 1>{}],
e_gs_ms_ns_strides[Number<NumDimG + NumDimM + NumDimN - 1>{}]));
// return matrix_padder.PadCDescriptor_M_N(e_grid_desc_g_mraw_nraw);
return e_grid_desc_g_mraw_nraw;
}
else
{
// naive tensor E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...]
const auto e_grid_desc_gs_ms_ns =
make_naive_tensor_descriptor(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
// transformed tensor E[G = G0 * G1 * ..., MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 *
// N2 * ...]
const auto e_grid_desc_g_mraw_nraw = transform_tensor_descriptor(
e_grid_desc_gs_ms_ns,
make_tuple(make_merge_transform(gLengths),
make_merge_transform(mLengths),
make_merge_transform(nLengths)),
make_tuple(gDimIds, mDimIds, nDimIds),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
// return matrix_padder.PadCDescriptor_M_N(e_grid_desc_g_mraw_nraw);
return e_grid_desc_g_mraw_nraw;
}
}
static auto MakeDsGridDescriptor_M_N(
const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_lengths_vec,
const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_strides_vec)
{
return generate_tuple(
[&](auto i) {
return DeviceOp::MakeEGridDescriptor_M_N(ds_gs_ms_ns_lengths_vec[i],
ds_gs_ms_ns_strides_vec[i]);
},
Number<NumDTensor>{});
}
static auto MakeDsGridDescriptor_G_M_N(
const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_lengths_vec,
const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_strides_vec)
{
return generate_tuple(
[&](auto i) {
return DeviceOp::MakeEGridDescriptor_G_M_N(ds_gs_ms_ns_lengths_vec[i],
ds_gs_ms_ns_strides_vec[i]);
},
Number<NumDTensor>{});
}
using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K({}, {}));
using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K({}, {}));
using DsGridDesc_M_N = remove_cvref_t<decltype(MakeDsGridDescriptor_M_N({{}}, {{}}))>;
using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N({}, {}));
using DsGridDesc_G_M_N = remove_cvref_t<decltype(MakeDsGridDescriptor_G_M_N({}, {}))>;
using EGridDesc_G_M_N = decltype(MakeEGridDescriptor_G_M_N({}, {}));
struct ComputePtrOffsetOfStridedBatch
{
ComputePtrOffsetOfStridedBatch(index_t batch_stride_A,
index_t batch_stride_B,
DsGridDesc_G_M_N ds_grid_desc_g_m_n,
EGridDesc_G_M_N e_grid_desc_g_m_n)
: batch_stride_A_(batch_stride_A),
batch_stride_B_(batch_stride_B),
ds_grid_desc_g_m_n_(ds_grid_desc_g_m_n),
e_grid_desc_g_m_n_(e_grid_desc_g_m_n)
{
}
__host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
{
return static_cast<long_index_t>(g_idx) * batch_stride_A_;
}
__host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
{
return static_cast<long_index_t>(g_idx) * batch_stride_B_;
}
__host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
{
std::array<long_index_t, NumDTensor> ds_offset;
static_for<0, NumDTensor, 1>{}([&](auto i) {
ds_offset[i] = static_cast<long_index_t>(g_idx) *
ds_grid_desc_g_m_n_[i].CalculateOffset(make_multi_index(1, 0, 0));
});
return ds_offset;
}
__host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
{
return static_cast<long_index_t>(g_idx) *
e_grid_desc_g_m_n_.CalculateOffset(make_multi_index(1, 0, 0));
}
private:
index_t batch_stride_A_;
index_t batch_stride_B_;
DsGridDesc_G_M_N ds_grid_desc_g_m_n_;
EGridDesc_G_M_N e_grid_desc_g_m_n_;
};
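// Note (illustrative): for a packed E of shape [G, M, N], e_grid_desc_g_m_n_.CalculateOffset(
// make_multi_index(1, 0, 0)) equals M * N, so GetEPtrOffset(g_idx) is an element offset of g_idx * M * N;
// the A/B offsets instead use the raw batch strides passed to the constructor.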
// GridwiseGemm
using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
ADataType, // TODO: distinguish A/B datatype
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
AElementwiseOperation,
BElementwiseOperation,
CDEElementwiseOperation,
InMemoryDataOperationEnum::Set,
NumGemmKPrefetchStage,
BlockSize,
MPerBlock,
NPerBlock,
KPerBlock,
AK1,
BK1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_AK0_M_AK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_AK1,
false,
ABlockLdsExtraM,
BBlockTransferThreadClusterLengths_BK0_N_BK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_BK1,
false,
BBlockLdsExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CDEBlockTransferScalarPerVector_NPerBlock,
LoopSched>;
// desc for blockwise copy
using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))>;
using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))>;
// block-to-e-tile map
using Block2ETileMap =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
// Argument
struct Argument : public BaseArgument
{
Argument(const void* p_a_grid,
const void* p_b_grid,
std::array<const void*, NumDTensor> p_ds_grid,
void* p_e_grid,
const std::vector<index_t>& a_gs_ms_ns_lengths,
const std::vector<index_t>& a_gs_ms_ks_strides,
const std::vector<index_t>& b_gs_ns_ks_lengths,
const std::vector<index_t>& b_gs_ns_ks_strides,
const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_lengths,
const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_strides,
const std::vector<index_t>& e_gs_ms_ns_lengths,
const std::vector<index_t>& e_gs_ms_ns_strides,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
: p_a_grid_{static_cast<const ADataType*>(p_a_grid)},
p_b_grid_{static_cast<const BDataType*>(p_b_grid)},
p_ds_grid_{},
p_e_grid_{static_cast<EDataType*>(p_e_grid)},
a_grid_desc_m_k_{
DeviceOp::MakeAGridDescriptor_M_K(a_gs_ms_ns_lengths, a_gs_ms_ks_strides)},
b_grid_desc_n_k_{
DeviceOp::MakeBGridDescriptor_N_K(b_gs_ns_ks_lengths, b_gs_ns_ks_strides)},
ds_grid_desc_m_n_{},
e_grid_desc_m_n_{
DeviceOp::MakeEGridDescriptor_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides)},
ds_grid_desc_g_m_n_{
DeviceOp::MakeDsGridDescriptor_G_M_N(ds_gs_ms_ns_lengths, ds_gs_ms_ns_strides)},
e_grid_desc_g_m_n_{
DeviceOp::MakeEGridDescriptor_G_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides)},
a_grid_desc_ak0_m_ak1_{
GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
b_grid_desc_bk0_n_bk1_{
GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
e_grid_desc_mblock_mperblock_nblock_nperblock_{},
block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
cde_element_op_{cde_element_op},
a_mz_stride_{},
a_kz_stride_{},
b_nz_stride_{},
b_kz_stride_{},
ds_nz_stride_{},
e_nz_stride_{},
a_batch_stride_{a_gs_ms_ks_strides[NumDimG - 1]},
b_batch_stride_{b_gs_ns_ks_strides[NumDimG - 1]},
compute_ptr_offset_of_batch_{
a_batch_stride_, b_batch_stride_, ds_grid_desc_g_m_n_, e_grid_desc_g_m_n_}
{
static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0, "");
// populate pointer, batch stride, desc for Ds
static_for<0, NumDTensor, 1>{}([&](auto i) {
using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
// D pointer
p_ds_grid_(i) = static_cast<const DDataType*>(p_ds_grid[i]);
// D desc
ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N(ds_gs_ms_ns_lengths[i],
ds_gs_ms_ns_strides[i]);
});
// populate desc for Ds/E
if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
b_grid_desc_n_k_,
ds_grid_desc_m_n_,
e_grid_desc_m_n_,
block_2_etile_map_))
{
e_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
e_grid_desc_m_n_);
ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
ds_grid_desc_m_n_);
}
// for sanity check of vector memory access
a_mz_stride_ = a_gs_ms_ks_strides[NumDimG + NumDimM - 1];
a_kz_stride_ = a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1];
b_nz_stride_ = b_gs_ns_ks_strides[NumDimG + NumDimN - 1];
b_kz_stride_ = b_gs_ns_ks_strides[NumDimG + NumDimN + NumDimK - 1];
for(index_t i = 0; i < NumDTensor; ++i)
{
ds_nz_stride_[i] = ds_gs_ms_ns_strides[i][NumDimG + NumDimM + NumDimN - 1];
}
e_nz_stride_ = e_gs_ms_ns_strides[NumDimG + NumDimM + NumDimN - 1];
}
void Print() const
{
std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl;
std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl;
static_for<0, NumDTensor, 1>{}(
[&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; });
std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl;
}
// private:
// pointers
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
typename GridwiseGemm::DsGridPointer p_ds_grid_;
EDataType* p_e_grid_;
// tensor descriptors for problem definition
AGridDesc_M_K a_grid_desc_m_k_;
BGridDesc_N_K b_grid_desc_n_k_;
DsGridDesc_M_N ds_grid_desc_m_n_;
EGridDesc_M_N e_grid_desc_m_n_;
DsGridDesc_G_M_N ds_grid_desc_g_m_n_;
EGridDesc_G_M_N e_grid_desc_g_m_n_;
// tensor descriptors for block/thread-wise copy
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
ds_grid_desc_mblock_mperblock_nblock_nperblock_;
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_;
// block-to-e-tile map
Block2ETileMap block_2_etile_map_;
// element-wise op
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
// Strides for the last M/N/K dimensions of A/B/Ds/E
// for sanity check of vector load/store
index_t a_mz_stride_;
index_t a_kz_stride_;
index_t b_nz_stride_;
index_t b_kz_stride_;
std::array<index_t, NumDTensor> ds_nz_stride_;
index_t e_mz_stride_;
index_t e_nz_stride_;
index_t a_batch_stride_;
index_t b_batch_stride_;
ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
arg.b_grid_desc_n_k_,
arg.ds_grid_desc_m_n_,
arg.e_grid_desc_m_n_,
arg.block_2_etile_map_))
{
throw std::runtime_error(
"wrong! GridwiseGemmMultipleD_xdl_cshuffle has invalid setting");
}
const index_t G = arg.e_grid_desc_g_m_n_.GetLength(I0);
const index_t grid_size =
arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * G;
const auto K =
arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
auto launch_kernel = [&](auto has_main_k_block_loop) {
constexpr bool has_main_loop = has_main_k_block_loop.value;
const auto kernel = kernel_contraction_multiple_d_xdl_cshuffle<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
typename GridwiseGemm::DsGridPointer,
EDataType,
AElementwiseOperation,
BElementwiseOperation,
CDEElementwiseOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
ComputePtrOffsetOfStridedBatch,
DeviceOp::Block2ETileMap,
has_main_loop>;
return launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_ds_grid_,
arg.p_e_grid_,
G,
arg.a_element_op_,
arg.b_element_op_,
arg.cde_element_op_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.compute_ptr_offset_of_batch_,
arg.block_2_etile_map_);
};
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
{
return launch_kernel(integral_constant<bool, true>{});
}
else
{
return launch_kernel(integral_constant<bool, false>{});
}
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static bool IsSupportedArgument(const Argument& arg)
{
if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
{
return false;
}
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
arg.b_grid_desc_n_k_,
arg.ds_grid_desc_m_n_,
arg.e_grid_desc_m_n_,
arg.block_2_etile_map_))
{
return false;
}
// check vector access
static_assert((ABlockTransferSrcVectorDim == 1 || ABlockTransferSrcVectorDim == 2) &&
(BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2),
"wrong!");
// vector memory access of A: could be on M or AK1 dimension
if constexpr(ABlockTransferSrcVectorDim == 1)
{
if(!(arg.a_mz_stride_ == 1 &&
arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0))
{
return false;
}
}
else
{
if(!(arg.a_kz_stride_ == 1 &&
arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0))
{
return false;
}
}
// vector memory access of B: could be on N or BK1 dimension
if constexpr(BBlockTransferSrcVectorDim == 1)
{
if(!(arg.b_nz_stride_ == 1 &&
arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0))
{
return false;
}
}
else
{
if(!(arg.b_kz_stride_ == 1 &&
arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0))
{
return false;
}
}
// vector memory access of Ds: always on NPerBlock dimension
bool valid_d_access = true;
static_for<0, NumDTensor, 1>{}([&](auto i) {
if(!(arg.ds_nz_stride_[i] == 1 &&
arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) %
CDEBlockTransferScalarPerVector_NPerBlock ==
0))
{
valid_d_access = false;
}
});
if(valid_d_access == false)
{
return false;
}
// vector memory access of E: always on NPerBlock dimension
if(!((arg.e_nz_stride_ == 1 &&
arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) %
CDEBlockTransferScalarPerVector_NPerBlock ==
0) ||
CDEBlockTransferScalarPerVector_NPerBlock == 1))
{
return false;
}
return true;
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto
MakeArgument(const void* p_a,
const void* p_b,
std::array<const void*, NumDTensor> p_ds,
void* p_e,
const std::vector<index_t>& a_gs_ms_ns_lengths,
const std::vector<index_t>& a_gs_ms_ks_strides,
const std::vector<index_t>& b_gs_ns_ks_lengths,
const std::vector<index_t>& b_gs_ns_ks_strides,
const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_lengths,
const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_strides,
const std::vector<index_t>& e_gs_ms_ns_lengths,
const std::vector<index_t>& e_gs_ms_ns_strides,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
{
return Argument{p_a,
p_b,
p_ds,
p_e,
a_gs_ms_ns_lengths,
a_gs_ms_ks_strides,
b_gs_ns_ks_lengths,
b_gs_ns_ks_strides,
ds_gs_ms_ns_lengths,
ds_gs_ms_ns_strides,
e_gs_ms_ns_lengths,
e_gs_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
std::array<const void*, NumDTensor> p_ds,
void* p_e,
const std::vector<index_t>& a_gs_ms_ns_lengths,
const std::vector<index_t>& a_gs_ms_ks_strides,
const std::vector<index_t>& b_gs_ns_ks_lengths,
const std::vector<index_t>& b_gs_ns_ks_strides,
const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_lengths,
const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_strides,
const std::vector<index_t>& e_gs_ms_ns_lengths,
const std::vector<index_t>& e_gs_ms_ns_strides,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) override
{
return std::make_unique<Argument>(p_a,
p_b,
p_ds,
p_e,
a_gs_ms_ns_lengths,
a_gs_ms_ks_strides,
b_gs_ns_ks_lengths,
b_gs_ns_ks_strides,
ds_gs_ms_ns_lengths,
ds_gs_ms_ns_strides,
e_gs_ms_ns_lengths,
e_gs_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceBatchedContractionMultipleD_Xdl_CShuffle"
<< "<"
<< NumDimG << ", "
<< NumDimM << ", "
<< NumDimN << ", "
<< NumDimK << ", "
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< KPerBlock << ", "
<< AK1 << ", "
<< BK1 << ", "
<< ABlockTransferSrcVectorDim << ", "
<< BBlockTransferSrcVectorDim
<< ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <memory>
#include <vector>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType,
typename CDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
struct DeviceBatchedGemm : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
ck::index_t StrideC,
ck::index_t BatchStrideA,
ck::index_t BatchStrideB,
ck::index_t BatchStrideC,
ck::index_t Batch,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType,
typename CDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
using DeviceBatchedGemmPtr = std::unique_ptr<DeviceBatchedGemm<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <memory>
#include <vector>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
struct BatchedGemmEPermuteDesc
{
ck::index_t G0_, G1_, M_, N_;
ck::index_t stride_G0_, stride_G1_, stride_M_, stride_N_;
};
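// Illustrative example: an E tensor stored contiguously as [G0, G1, M, N] would use
// stride_G0_ = G1_ * M_ * N_, stride_G1_ = M_ * N_, stride_M_ = N_, stride_N_ = 1; other stride
// combinations express a permuted output layout.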
template <typename ALayout,
typename BLayout,
typename DELayout,
typename ADataType,
typename BDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation>
struct DeviceBatchedGemmEPermute : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_e,
index_t M,
index_t N,
index_t K,
index_t stride_A,
index_t stride_B,
index_t batch_stride_A,
index_t batch_stride_B,
BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
index_t BatchCount,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <memory>
#include <vector>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename ALayout,
typename B0Layout,
typename B1Layout,
typename CLayout,
typename ADataType,
typename B0DataType,
typename B1DataType,
typename CDataType,
typename AElementwiseOperation,
typename B0ElementwiseOperation,
typename Acc0ElementwiseOperation,
typename B1ElementwiseOperation,
typename CElementwiseOperation>
struct DeviceBatchedGemmGemm : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b0,
const void* p_b1,
void* p_c,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t O,
ck::index_t Batch,
ck::index_t StrideA,
ck::index_t StrideB0,
ck::index_t StrideB1,
ck::index_t StrideC,
ck::index_t BatchStrideA,
ck::index_t BatchStrideB0,
ck::index_t BatchStrideB1,
ck::index_t BatchStrideC,
AElementwiseOperation a_element_op,
B0ElementwiseOperation b0_element_op,
Acc0ElementwiseOperation acc0_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename AccElementwiseOperation,
typename B1ElementwiseOperation,
typename CElementwiseOperation,
typename AGridDesc_AK0_M_AK1,
typename BGridDesc_BK0_N_BK1,
typename B1GridDesc_BK0_N_BK1,
typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename Block2CTileMap,
typename ComputeBasePtrOfStridedBatch,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_gemm_xdl_cshuffle_v1(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatAB* __restrict__ p_b1_grid,
FloatC* __restrict__ p_c_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const AccElementwiseOperation acc_element_op,
const B1ElementwiseOperation b1_element_op,
const CElementwiseOperation c_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2CTileMap block_2_ctile_map,
const index_t batch_count,
const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset,
p_b1_grid + b1_batch_offset,
p_c_grid + c_batch_offset,
p_shared,
a_element_op,
b_element_op,
acc_element_op,
b1_element_op,
c_element_op,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
b1_grid_desc_bk0_n_bk1,
c_grid_desc_mblock_mperblock_nblock_nperblock,
block_2_ctile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_b1_grid;
ignore = p_c_grid;
ignore = a_element_op;
ignore = b_element_op;
ignore = acc_element_op;
ignore = b1_element_op;
ignore = c_element_op;
ignore = a_grid_desc_ak0_m_ak1;
ignore = b_grid_desc_bk0_n_bk1;
ignore = b1_grid_desc_bk0_n_bk1;
ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
ignore = block_2_ctile_map;
ignore = batch_count;
ignore = compute_base_ptr_of_batch;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
}
// Computes C = A * B0 * B1
//              ^^^^^^ (Acc0)
//              ^^^^^^^^^^^ (Acc1)
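// Shapes (illustrative): A is [M, K], B0 is [K, N] and B1 is [N, O], so Acc0 = A * B0 is [M, N] and
// C = Acc0 * B1 is [M, O]; O corresponds to the Gemm1NRaw argument below.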
template <typename ALayout,
typename BLayout, // B0Layout
typename B1Layout,
typename CLayout,
typename ADataType,
typename BDataType,
typename B1DataType,
typename CDataType,
typename GemmAccDataType,
typename CShuffleDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename AccElementwiseOperation,
typename B1ElementwiseOperation,
typename CElementwiseOperation,
GemmSpecialization GemmSpec,
index_t NumGemmKPrefetchStage,
index_t BlockSize,
index_t MPerBlock,
index_t NPerBlock, // Gemm0NPerBlock
index_t KPerBlock, // Gemm0KPerBlock
index_t Gemm1NPerBlock,
index_t Gemm1KPerBlock,
index_t AK1,
index_t BK1,
index_t B1K1,
index_t MPerXDL,
index_t NPerXDL,
index_t MXdlPerWave,
index_t NXdlPerWave,
index_t Gemm1NXdlPerWave,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsExtraN,
typename B1BlockTransferThreadClusterLengths_BK0_N_BK1,
typename B1BlockTransferThreadClusterArrangeOrder,
typename B1BlockTransferSrcAccessOrder,
index_t B1BlockTransferSrcVectorDim,
index_t B1BlockTransferSrcScalarPerVector,
index_t B1BlockTransferDstScalarPerVector_BK1,
bool B1BlockLdsExtraN,
index_t CShuffleMXdlPerWavePerShuffle,
index_t CShuffleNXdlPerWavePerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
LoopScheduler LoopSched = LoopScheduler::Default>
struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout,
BLayout,
B1Layout,
CLayout,
ADataType,
BDataType,
B1DataType,
CDataType,
AElementwiseOperation,
BElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CElementwiseOperation>
{
using DeviceOp = DeviceBatchedGemmGemm_Xdl_CShuffle;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto matrix_padder =
GemmGemmPadder<GemmSpec, index_t, index_t, index_t, index_t>{
MPerBlock, NPerBlock, KPerBlock, Gemm1NPerBlock};
static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
{
const auto a_grid_desc_mraw_kraw = [&]() {
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(StrideA, I1));
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(I1, StrideA));
}
}();
const auto a_grid_desc_m_k = matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
const auto M = a_grid_desc_m_k.GetLength(I0);
const auto K = a_grid_desc_m_k.GetLength(I1);
const auto AK0 = K / AK1;
return transform_tensor_descriptor(a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(M)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
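// Note: K above is the padded length produced by matrix_padder and is split as K = AK0 * AK1, with AK1
// fixed at compile time, so K is expected to be divisible by AK1; the resulting descriptor is ordered
// [AK0, M, AK1].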
static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
const auto b_grid_desc_n_k = matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
const auto N = b_grid_desc_n_k.GetLength(I0);
const auto K = b_grid_desc_n_k.GetLength(I1);
const auto BK0 = K / BK1;
return transform_tensor_descriptor(b_grid_desc_n_k,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(N)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
// Args: Gemm1KRaw, Gemm1NRaw, StrideB1
static auto MakeB1GridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b1_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, B1Layout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, B1Layout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
const auto b1_grid_desc_n_k = matrix_padder.PadB1Descriptor_N_K(b1_grid_desc_nraw_kraw);
const auto N = b1_grid_desc_n_k.GetLength(I0);
const auto K = b1_grid_desc_n_k.GetLength(I1);
const auto B1K0 = K / B1K1;
return transform_tensor_descriptor(
b1_grid_desc_n_k,
make_tuple(make_unmerge_transform(make_tuple(B1K0, B1K1)),
make_pass_through_transform(N)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC)
{
const auto c_grid_desc_mraw_nraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(StrideC, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(I1, StrideC));
}
}();
return matrix_padder.PadCDescriptor_M_N(c_grid_desc_mraw_nraw);
}
struct ComputeBasePtrOfStridedBatch
{
ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideB1,
index_t BatchStrideC)
: BatchStrideA_(BatchStrideA),
BatchStrideB_(BatchStrideB),
BatchStrideB1_(BatchStrideB1),
BatchStrideC_(BatchStrideC)
{
}
__host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_);
}
__host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB1_);
}
__host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideC_);
}
private:
index_t BatchStrideA_;
index_t BatchStrideB_;
index_t BatchStrideB1_;
index_t BatchStrideC_;
};
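// Note: batch base pointers here are plain element offsets, e.g. GetABasePtr(g_idx) = g_idx * BatchStrideA.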
using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
using B1GridDesc_BK0_N_BK1 = decltype(MakeB1GridDescriptor_BK0_N_BK1(1, 1, 1));
using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
// GridwiseGemm
using GridwiseGemm = GridwiseBatchedGemmGemm_Xdl_CShuffle<
ADataType, // TODO: distinguish A/B datatype
GemmAccDataType,
CShuffleDataType,
CDataType,
AElementwiseOperation,
BElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CElementwiseOperation,
InMemoryDataOperationEnum::Set,
AGridDesc_AK0_M_AK1,
BGridDesc_BK0_N_BK1,
B1GridDesc_BK0_N_BK1,
CGridDesc_M_N,
NumGemmKPrefetchStage,
BlockSize,
MPerBlock,
NPerBlock,
KPerBlock,
Gemm1NPerBlock,
Gemm1KPerBlock,
AK1,
BK1,
B1K1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
Gemm1NXdlPerWave,
ABlockTransferThreadClusterLengths_AK0_M_AK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_AK1,
true,
ABlockLdsExtraM,
BBlockTransferThreadClusterLengths_BK0_N_BK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_BK1,
true,
BBlockLdsExtraN,
B1BlockTransferThreadClusterLengths_BK0_N_BK1,
B1BlockTransferThreadClusterArrangeOrder,
B1BlockTransferSrcAccessOrder,
B1BlockTransferSrcVectorDim,
B1BlockTransferSrcScalarPerVector,
B1BlockTransferDstScalarPerVector_BK1,
false,
B1BlockLdsExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CShuffleBlockTransferScalarPerVector_NPerBlock,
LoopSched>;
// Argument
struct Argument : public BaseArgument
{
Argument(const ADataType* p_a_grid,
const BDataType* p_b_grid,
const B1DataType* p_b1_grid,
CDataType* p_c_grid,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw, // = ORaw
index_t Batch,
index_t StrideA,
index_t StrideB,
index_t StrideB1,
index_t StrideC,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideB1,
index_t BatchStrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
AccElementwiseOperation acc_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op)
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_b1_grid_{p_b1_grid},
p_c_grid_{p_c_grid},
a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
b1_grid_desc_bk0_n_bk1_{
DeviceOp::MakeB1GridDescriptor_BK0_N_BK1(NRaw, Gemm1NRaw, StrideB1)},
c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, Gemm1NRaw, StrideC)},
c_grid_desc_mblock_mperblock_nblock_nperblock_{},
block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
acc_element_op_{acc_element_op},
b1_element_op_{b1_element_op},
c_element_op_{c_element_op},
batch_count_(Batch),
compute_base_ptr_of_batch_{BatchStrideA, BatchStrideB, BatchStrideB1, BatchStrideC},
raw_lengths_m_n_k_o_{MRaw, NRaw, KRaw, Gemm1NRaw}
{
if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
b_grid_desc_bk0_n_bk1_,
b1_grid_desc_bk0_n_bk1_,
c_grid_desc_m_n_,
block_2_ctile_map_))
{
c_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
c_grid_desc_m_n_);
}
}
void Print() const
{
std::cout << "A[AK0, M, AK1]: " << a_grid_desc_ak0_m_ak1_ << std::endl;
std::cout << "B0[BK0, N, BK1]: " << b_grid_desc_bk0_n_bk1_ << std::endl;
std::cout << "B1[BK0, N, BK1]: " << b1_grid_desc_bk0_n_bk1_ << std::endl;
std::cout << "C[M, N]: " << c_grid_desc_m_n_ << std::endl;
}
// private:
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
const B1DataType* p_b1_grid_;
CDataType* p_c_grid_;
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_;
CGridDesc_M_N c_grid_desc_m_n_;
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock_;
typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
AccElementwiseOperation acc_element_op_;
B1ElementwiseOperation b1_element_op_;
CElementwiseOperation c_element_op_;
index_t batch_count_;
ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;
// For robust IsSupportedArgument() check
std::vector<index_t> raw_lengths_m_n_k_o_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
if(!DeviceOp::IsSupportedArgument(arg))
{
throw std::runtime_error("wrong! unsupported argument");
}
const index_t grid_size =
arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_;
// Gemm0_K
const auto K =
arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
float ave_time = 0;
auto launch_kernel = [&](auto has_main_k_block_loop_) {
const auto kernel = kernel_gemm_gemm_xdl_cshuffle_v1<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
AElementwiseOperation,
BElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CElementwiseOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
DeviceOp::B1GridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename GridwiseGemm::DefaultBlock2CTileMap,
ComputeBasePtrOfStridedBatch,
has_main_k_block_loop_>;
return launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_b1_grid_,
arg.p_c_grid_,
arg.a_element_op_,
arg.b_element_op_,
arg.acc_element_op_,
arg.b1_element_op_,
arg.c_element_op_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.b1_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.block_2_ctile_map_,
arg.batch_count_,
arg.compute_base_ptr_of_batch_);
};
// Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so we only need
// to be concerned with Gemm0's main K loop
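// Illustrative example (hypothetical numbers, not from the source): with KRaw = 128 and AK1 = 8,
// the descriptor gives AK0 = 16, so K = AK0 * AK1 = 128 as computed above, and
// CalculateHasMainKBlockLoop(K) selects between the kernel instantiations with and without the
// main K-block loop.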
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
{
ave_time = launch_kernel(integral_constant<bool, true>{});
}
else
{
ave_time = launch_kernel(integral_constant<bool, false>{});
}
return ave_time;
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
{
return false;
}
// Note: we need raw lengths since the threadwise copy cannot handle a vector load when
// part of the vector is out of bounds
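// For instance (illustrative values, not from the source): with a row-major A and
// ABlockTransferSrcScalarPerVector = 8, the check below requires KRaw % 8 == 0 so that an
// 8-wide vector load along the contiguous K dimension never crosses the tensor boundary.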
const auto MRaw = arg.raw_lengths_m_n_k_o_[0];
const auto NRaw = arg.raw_lengths_m_n_k_o_[1];
const auto KRaw = arg.raw_lengths_m_n_k_o_[2];
const auto Gemm1NRaw = arg.raw_lengths_m_n_k_o_[3];
// Check scalar per vector requirement
const auto a_extent_lowest =
is_same_v<tensor_layout::gemm::RowMajor, ALayout> ? KRaw : MRaw;
const auto b_extent_lowest =
is_same_v<tensor_layout::gemm::RowMajor, BLayout> ? NRaw : KRaw;
const auto b1_extent_lowest =
is_same_v<tensor_layout::gemm::RowMajor, B1Layout> ? Gemm1NRaw : NRaw;
const auto c_extent_lowest =
is_same_v<tensor_layout::gemm::RowMajor, CLayout> ? Gemm1NRaw : MRaw;
if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 &&
b_extent_lowest % BBlockTransferSrcScalarPerVector == 0 &&
b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 &&
c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0))
{
return false;
}
return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.b1_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_m_n_,
arg.block_2_ctile_map_);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const ADataType* p_a,
const BDataType* p_b,
const B1DataType* p_b1,
CDataType* p_c,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw,
index_t Batch,
index_t StrideA,
index_t StrideB,
index_t StrideB1,
index_t StrideC,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideB1,
index_t BatchStrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
AccElementwiseOperation acc_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op)
{
return Argument{p_a, p_b, p_b1, p_c, MRaw,
NRaw, KRaw, Gemm1NRaw, Batch, StrideA,
StrideB, StrideB1, StrideC, BatchStrideA, BatchStrideB,
BatchStrideB1, BatchStrideC, a_element_op, b_element_op, acc_element_op,
b1_element_op, c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
const void* p_b1,
void* p_c,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw,
index_t Batch,
index_t StrideA,
index_t StrideB,
index_t StrideB1,
index_t StrideC,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideB1,
index_t BatchStrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
AccElementwiseOperation acc_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op) override
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<const B1DataType*>(p_b1),
static_cast<CDataType*>(p_c),
MRaw,
NRaw,
KRaw,
Gemm1NRaw,
Batch,
StrideA,
StrideB,
StrideB1,
StrideC,
BatchStrideA,
BatchStrideB,
BatchStrideB1,
BatchStrideC,
a_element_op,
b_element_op,
acc_element_op,
b1_element_op,
c_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceBatchedGemmGemm_Xdl_CShuffle"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< KPerBlock << ", "
<< AK1 << ", "
<< BK1 << ", "
<< MPerBlock << ", "
<< Gemm1NPerBlock << ", "
<< Gemm1KPerBlock << ", "
<< B1K1 << ", "
<< getGemmSpecializationString(GemmSpec) << ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
typename ADataType,
typename BDataType,
typename DsDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation>
struct DeviceBatchedGemmMultiD : public BaseOperator
{
static constexpr index_t NumDTensor = DsDataType::Size();
static_assert(DsLayout::Size() == DsDataType::Size(), "wrong! inconsistent NumDTensor");
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
const std::array<const void*, NumDTensor>& p_ds,
void* p_e,
index_t M,
index_t N,
index_t K,
index_t Batch,
index_t StrideA,
index_t StrideB,
const std::array<ck::index_t, NumDTensor>& StrideDs,
index_t StrideE,
index_t BatchStrideA,
index_t BatchStrideB,
const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
index_t BatchStrideE,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
/*
* \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
*
* \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix
 * given the batch index. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of
 * evenly strided batches, but this can easily be extended to other layouts. The returned offset
 * can be either \p index_t or \p long_index_t. If it returns \p long_index_t, we are not subject
 * to the 2GB limitation.
*
* \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and
* returns the 2D index of the tile that it computes. \see
* GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run().
*
* \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
* tiles from different matrices. Keep in mind that these 2 matrices can share the same grid
* descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link
* device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link
 * DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computation
 * of the pointer offset in \p ComputePtrOffsetOfStridedBatch.
*
* \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes.
 * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion) to
* realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion).
*
*/
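// A minimal sketch of the ComputePtrOffsetOfBatch concept described above (hedged; the struct and
// member names here are hypothetical, the concrete implementation used by this file is
// ComputePtrOffsetOfStridedBatch further below):
//
//   struct ExamplePtrOffsetOfBatch
//   {
//       // evenly strided batches: batch g starts g * batch_stride_a elements into A
//       __host__ __device__ long_index_t GetAPtrOffset(index_t g) const
//       {
//           return static_cast<long_index_t>(g) * batch_stride_a;
//       }
//       long_index_t batch_stride_a;
//   };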
template <typename GridwiseGemm,
typename ABDataType,
typename DsPointer,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
typename AGridDesc_AK0_M_AK1,
typename BGridDesc_BK0_N_BK1,
typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
typename ComputePtrOffsetOfBatch,
typename Block2ETileMap,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_xdl(const ABDataType* __restrict__ p_a_grid,
const ABDataType* __restrict__ p_b_grid,
DsPointer p_ds_grid,
EDataType* __restrict__ p_e_grid,
const index_t batch_count,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1,
const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
e_grid_desc_mblock_mperblock_nblock_nperblock_,
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
const Block2ETileMap block_2_etile_map)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
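// Illustrative mapping (hypothetical sizes, not from the source): a launch with grid_size = 256
// and batch_count = 8 gives num_blocks_per_batch = 32, so block 70 belongs to batch
// g_idx = 70 / 32 = 2; the per-batch pointer offsets below are then derived from g_idx.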
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
DsPointer p_ds_grid_grp;
static constexpr index_t NumDTensor =
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
static_for<0, NumDTensor, 1>{}(
[&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset,
p_ds_grid_grp,
p_e_grid + e_batch_offset,
p_shared,
a_element_op,
b_element_op,
cde_element_op,
a_grid_desc_k0_m_k1,
b_grid_desc_k0_n_k1,
ds_grid_desc_mblock_mperblock_nblock_nperblock,
e_grid_desc_mblock_mperblock_nblock_nperblock_,
block_2_etile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_ds_grid;
ignore = p_e_grid;
ignore = batch_count;
ignore = a_grid_desc_k0_m_k1;
ignore = b_grid_desc_k0_n_k1;
ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
ignore = e_grid_desc_mblock_mperblock_nblock_nperblock_;
ignore = a_element_op;
ignore = b_element_op;
ignore = cde_element_op;
ignore = compute_ptr_offset_of_batch;
ignore = block_2_etile_map;
#endif
}
template <typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
typename ADataType,
typename BDataType,
typename AccDataType,
typename CShuffleDataType,
typename DsDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
GemmSpecialization GemmSpec,
index_t NumGemmKPrefetchStage,
index_t BlockSize,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t AK1,
index_t BK1,
index_t MPerXDL,
index_t NPerXDL,
index_t MXdlPerWave,
index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsExtraN,
index_t CShuffleMXdlPerWavePerShuffle,
index_t CShuffleNXdlPerWavePerShuffle,
typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CDEBlockTransferScalarPerVector_NPerBlock,
LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
BLayout,
DsLayout,
ELayout,
ADataType,
BDataType,
DsDataType,
EDataType,
AElementwiseOperation,
BElementwiseOperation,
CDEElementwiseOperation>
{
using DeviceOp = DeviceBatchedGemmMultiD_Xdl;
static constexpr index_t NumDTensor = DsDataType::Size();
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto matrix_padder =
MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA)
{
const auto a_grid_desc_mraw_kraw = [&]() {
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(StrideA, I1));
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(I1, StrideA));
}
}();
return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
}
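// Illustrative example (hypothetical numbers): for a row-major A with MRaw = 100, KRaw = 40 and
// StrideA = 40, the naive descriptor above places element (m, k) at offset m * 40 + k;
// matrix_padder then pads the extents according to GemmSpec (assumption: padding rounds M/K up
// to multiples of MPerBlock/KPerBlock so the block tiling divides evenly).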
static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
}
template <typename ELay>
static auto MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE)
{
const auto e_grid_desc_mraw_nraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ELay>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(StrideE, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ELay>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(I1, StrideE));
}
}();
return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
}
static auto MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
const std::array<index_t, NumDTensor>& NRaws,
const std::array<index_t, NumDTensor>& DsStride)
{
return generate_tuple(
[&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(MRaws[i], NRaws[i], DsStride[i]);
},
Number<NumDTensor>{});
}
using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K(1, 1, 1));
using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K(1, 1, 1));
using DsGridDesc_M_N = remove_cvref_t<decltype(MakeDsGridDescriptor_M_N({}, {}, {}))>;
using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N<ELayout>(1, 1, 1));
struct ComputePtrOffsetOfStridedBatch
{
ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
std::array<ck::index_t, NumDTensor> BatchStrideDs,
index_t BatchStrideE)
: BatchStrideA_(BatchStrideA),
BatchStrideB_(BatchStrideB),
BatchStrideDs_(BatchStrideDs),
BatchStrideE_(BatchStrideE)
{
}
__host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_);
}
__host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
{
std::array<long_index_t, NumDTensor> ds_offset;
static_for<0, NumDTensor, 1>{}([&](auto i) {
ds_offset[i] = g_idx * static_cast<long_index_t>(BatchStrideDs_[i]);
});
return ds_offset;
}
__host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideE_);
}
private:
index_t BatchStrideA_;
index_t BatchStrideB_;
std::array<ck::index_t, NumDTensor> BatchStrideDs_;
index_t BatchStrideE_;
};
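// Usage sketch (illustrative, not from the source): for batch index g the kernel reads A at
// p_a_grid_ + GetAPtrOffset(g) = p_a_grid_ + g * BatchStrideA_; a densely packed batched layout
// would pass BatchStrideA = M * K, while BatchStrideA = 0 makes every batch reuse the same A.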
using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
ADataType, // TODO: distinguish A/B datatype
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
AElementwiseOperation,
BElementwiseOperation,
CDEElementwiseOperation,
InMemoryDataOperationEnum::Set,
NumGemmKPrefetchStage,
BlockSize,
MPerBlock,
NPerBlock,
KPerBlock,
AK1,
BK1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_AK0_M_AK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_AK1,
false,
ABlockLdsExtraM,
BBlockTransferThreadClusterLengths_BK0_N_BK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_BK1,
false,
BBlockLdsExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CDEBlockTransferScalarPerVector_NPerBlock,
LoopSched>;
// desc for blockwise copy
using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))>;
using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))>;
// block-to-e-tile map
using Block2ETileMap =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
// Argument
struct Argument : public BaseArgument
{
Argument(const void* p_a_grid,
const void* p_b_grid,
std::array<const void*, NumDTensor> p_ds_grid,
void* p_e_grid,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Batch,
index_t StrideA,
index_t StrideB,
const std::array<ck::index_t, NumDTensor>& StrideDs,
index_t StrideE,
index_t BatchStrideA,
index_t BatchStrideB,
const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
index_t BatchStrideE,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
: p_a_grid_{static_cast<const ADataType*>(p_a_grid)},
p_b_grid_{static_cast<const BDataType*>(p_b_grid)},
p_ds_grid_{},
p_e_grid_{static_cast<EDataType*>(p_e_grid)},
Batch_(Batch),
a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(MRaw, KRaw, StrideA)},
b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(KRaw, NRaw, StrideB)},
ds_grid_desc_m_n_{},
e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N<ELayout>(MRaw, NRaw, StrideE)},
a_grid_desc_ak0_m_ak1_{
GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
b_grid_desc_bk0_n_bk1_{
GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
e_grid_desc_mblock_mperblock_nblock_nperblock_{},
compute_ptr_offset_of_batch_{BatchStrideA, BatchStrideB, BatchStrideDs, BatchStrideE},
block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
cde_element_op_{cde_element_op}
{
// populate pointer, desc for Ds
static_for<0, NumDTensor, 1>{}([&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
// D pointer
p_ds_grid_(i) = static_cast<const DDataType*>(p_ds_grid[i]);
// D desc
ds_grid_desc_m_n_(i) =
DeviceOp::MakeEGridDescriptor_M_N<DLayout>(MRaw, NRaw, StrideDs[i]);
});
// populate desc for Ds/E
if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
b_grid_desc_n_k_,
ds_grid_desc_m_n_,
e_grid_desc_m_n_,
block_2_etile_map_))
{
ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
ds_grid_desc_m_n_);
e_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
e_grid_desc_m_n_);
}
}
void Print() const
{
std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl;
std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl;
static_for<0, NumDTensor, 1>{}(
[&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; });
std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl;
}
// private:
// pointers
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
typename GridwiseGemm::DsGridPointer p_ds_grid_;
EDataType* p_e_grid_;
// Batch
index_t Batch_;
// tensor descriptors for problem definition
AGridDesc_M_K a_grid_desc_m_k_;
BGridDesc_N_K b_grid_desc_n_k_;
DsGridDesc_M_N ds_grid_desc_m_n_;
EGridDesc_M_N e_grid_desc_m_n_;
// tensor descriptors for block/thread-wise copy
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
ds_grid_desc_mblock_mperblock_nblock_nperblock_;
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_;
// for calculating batch offset
ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;
// block-to-e-tile map
Block2ETileMap block_2_etile_map_;
// element-wise op
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceBatchedGemmMultiD_Xdl::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
arg.b_grid_desc_n_k_,
arg.ds_grid_desc_m_n_,
arg.e_grid_desc_m_n_,
arg.block_2_etile_map_))
{
throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
}
const index_t grid_size =
arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * arg.Batch_;
const auto K =
arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
auto launch_kernel = [&](auto has_main_k_block_loop) {
constexpr bool has_main_loop = has_main_k_block_loop.value;
const auto kernel =
kernel_batched_gemm_xdl<GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
typename GridwiseGemm::DsGridPointer,
EDataType,
AElementwiseOperation,
BElementwiseOperation,
CDEElementwiseOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
ComputePtrOffsetOfStridedBatch,
Block2ETileMap,
has_main_loop>;
return launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_ds_grid_,
arg.p_e_grid_,
arg.Batch_,
arg.a_element_op_,
arg.b_element_op_,
arg.cde_element_op_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.compute_ptr_offset_of_batch_,
arg.block_2_etile_map_);
};
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
{
return launch_kernel(integral_constant<bool, true>{});
}
else
{
return launch_kernel(integral_constant<bool, false>{});
}
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static bool IsSupportedArgument(const Argument& arg)
{
if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
{
return false;
}
return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
arg.b_grid_desc_n_k_,
arg.ds_grid_desc_m_n_,
arg.e_grid_desc_m_n_,
arg.block_2_etile_map_);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const void* p_a,
const void* p_b,
const std::array<const void*, NumDTensor>& p_ds,
void* p_e,
index_t M,
index_t N,
index_t K,
index_t Batch,
index_t StrideA,
index_t StrideB,
const std::array<index_t, NumDTensor>& StrideDs,
index_t StrideE,
index_t BatchStrideA,
index_t BatchStrideB,
const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
index_t BatchStrideE,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
{
return Argument{p_a,
p_b,
p_ds,
p_e,
M,
N,
K,
Batch,
StrideA,
StrideB,
StrideDs,
StrideE,
BatchStrideA,
BatchStrideB,
BatchStrideDs,
BatchStrideE,
a_element_op,
b_element_op,
cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
const std::array<const void*, NumDTensor>& p_ds,
void* p_e,
index_t M,
index_t N,
index_t K,
index_t Batch,
index_t StrideA,
index_t StrideB,
const std::array<ck::index_t, NumDTensor>& StrideDs,
index_t StrideE,
index_t BatchStrideA,
index_t BatchStrideB,
const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
index_t BatchStrideE,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) override
{
return std::make_unique<Argument>(p_a,
p_b,
p_ds,
p_e,
M,
N,
K,
Batch,
StrideA,
StrideB,
StrideDs,
StrideE,
BatchStrideA,
BatchStrideB,
BatchStrideDs,
BatchStrideE,
a_element_op,
b_element_op,
cde_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceBatchedGemmMultiD_Xdl"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< KPerBlock << ", "
<< AK1 << ", "
<< BK1 << ", "
<< getGemmSpecializationString(GemmSpec)
<< ">";
// clang-format on
return str.str();
}
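// Example of the emitted string with illustrative (not actual) parameter values:
// "DeviceBatchedGemmMultiD_Xdl<256, 128, 128, 32, 8, 8, Default>"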
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename A0Layout,
typename B0Layout,
typename D0sLayout,
typename B1Layout,
typename D1sLayout,
typename E1Layout,
typename A0DataType,
typename B0DataType,
typename D0sDataType,
typename B1DataType,
typename D1sDataType,
typename E1DataType,
typename A0ElementwiseOperation,
typename B0ElementwiseOperation,
typename CDE0ElementwiseOperation,
typename B1ElementwiseOperation,
typename CDE1ElementwiseOperation>
struct DeviceBatchedGemmMultipleDGemmMultipleD : public BaseOperator
{
static constexpr index_t NumD0Tensor = D0sDataType::Size();
static constexpr index_t NumD1Tensor = D1sDataType::Size();
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a0,
const void* p_b0,
std::array<const void*, NumD0Tensor> p_d0s,
const void* p_b1,
std::array<const void*, NumD1Tensor> p_d1s,
void* p_e1,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t O,
ck::index_t Batch,
ck::index_t StrideA0,
ck::index_t StrideB0,
std::array<ck::index_t, NumD0Tensor> StrideD0s,
ck::index_t StrideB1,
std::array<ck::index_t, NumD1Tensor> StrideD1s,
ck::index_t StrideE1,
ck::index_t BatchStrideA0,
ck::index_t BatchStrideB0,
std::array<ck::index_t, NumD0Tensor> BatchStrideD0s,
ck::index_t BatchStrideB1,
std::array<ck::index_t, NumD1Tensor> BatchStrideD1s,
ck::index_t BatchStrideE1,
A0ElementwiseOperation a0_element_op,
B0ElementwiseOperation b0_element_op,
CDE0ElementwiseOperation cde0_element_op,
B1ElementwiseOperation b1_element_op,
CDE1ElementwiseOperation cde1_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename GridwiseGemm,
typename A0B0B1DataType,
typename D0sPointer,
typename D1sPointer,
typename E1DataType,
typename A0ElementwiseOperation,
typename B0ElementwiseOperation,
typename CDE0ElementwiseOperation,
typename B1ElementwiseOperation,
typename CDE1ElementwiseOperation,
typename A0GridDesc_AK0_M_AK1,
typename B0GridDesc_BK0_N_BK1,
typename D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5,
typename B1GridDesc_BK0_N_BK1,
typename D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename Block2E1TileMap,
typename ComputeBasePtrOfStridedBatch,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_gemm_xdl_cshuffle_v1(
const A0B0B1DataType* __restrict__ p_a0_grid,
const A0B0B1DataType* __restrict__ p_b0_grid,
D0sPointer p_d0s_grid,
const A0B0B1DataType* __restrict__ p_b1_grid,
D1sPointer p_d1s_grid,
E1DataType* __restrict__ p_e1_grid,
const A0ElementwiseOperation a0_element_op,
const B0ElementwiseOperation b0_element_op,
const CDE0ElementwiseOperation cde0_element_op,
const B1ElementwiseOperation b1_element_op,
const CDE1ElementwiseOperation cde1_element_op,
const A0GridDesc_AK0_M_AK1 a0_grid_desc_ak0_m_ak1,
const B0GridDesc_BK0_N_BK1 b0_grid_desc_bk0_n_bk1,
const D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
const D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
d1s_grid_desc_mblock_mperblock_nblock_nperblock,
const E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
e1_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2E1TileMap block_2_e1tile_map,
const index_t batch_count,
const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
static_for<0, p_d0s_grid.Size(), 1>{}([&](auto In) {
const long_index_t d0_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetD0BasePtr(g_idx, In)));
p_d0s_grid(In) = p_d0s_grid(In) + d0_batch_offset;
});
static_for<0, p_d1s_grid.Size(), 1>{}([&](auto In) {
const long_index_t d1_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetD1BasePtr(g_idx, In)));
p_d1s_grid(In) = p_d1s_grid(In) + d1_batch_offset;
});
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a0_grid + a_batch_offset,
p_b0_grid + b_batch_offset,
p_d0s_grid,
p_b1_grid + b1_batch_offset,
p_d1s_grid,
p_e1_grid + c_batch_offset,
p_shared,
a0_element_op,
b0_element_op,
cde0_element_op,
b1_element_op,
cde1_element_op,
a0_grid_desc_ak0_m_ak1,
b0_grid_desc_bk0_n_bk1,
d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
b1_grid_desc_bk0_n_bk1,
d1s_grid_desc_mblock_mperblock_nblock_nperblock,
e1_grid_desc_mblock_mperblock_nblock_nperblock,
block_2_e1tile_map);
#else
ignore = p_a0_grid;
ignore = p_b0_grid;
ignore = p_d0s_grid;
ignore = p_b1_grid;
ignore = p_d1s_grid;
ignore = p_e1_grid;
ignore = a0_element_op;
ignore = b0_element_op;
ignore = cde0_element_op;
ignore = b1_element_op;
ignore = cde1_element_op;
ignore = a0_grid_desc_ak0_m_ak1;
ignore = b0_grid_desc_bk0_n_bk1;
ignore = d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5;
ignore = b1_grid_desc_bk0_n_bk1;
ignore = d1s_grid_desc_mblock_mperblock_nblock_nperblock;
ignore = e1_grid_desc_mblock_mperblock_nblock_nperblock;
ignore = block_2_e1tile_map;
ignore = batch_count;
ignore = compute_base_ptr_of_batch;
#endif
}
// Computes C = A * B0 * B1
// ^^^^^^ (Acc0)
// ^^^^^^^^^^^ (Acc1)
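// A hedged reading of the fused computation realized by the template parameters below
// (notation only, not a quote from the source):
//   Acc0 = A0 * B0
//   Gemm1 input = cde0_element_op(Acc0, D0s...)
//   Acc1 = (Gemm1 input) * B1
//   E1 = cde1_element_op(Acc1, D1s...)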
template <typename A0Layout,
typename B0Layout,
typename D0sLayout,
typename B1Layout,
typename D1sLayout,
typename E1Layout,
typename A0DataType,
typename B0DataType,
typename Acc0DataType,
typename D0sDataType,
typename B1DataType,
typename Acc1DataType,
typename C1ShuffleDataType,
typename D1sDataType,
typename E1DataType,
typename A0ElementwiseOperation,
typename B0ElementwiseOperation,
typename CDE0ElementwiseOperation,
typename B1ElementwiseOperation,
typename CDE1ElementwiseOperation,
bool PadGemm0M,
bool PadGemm0N,
bool PadGemm0K,
bool PadGemm1N,
bool PadGemm1K,
index_t NumGemm0KPrefetchStage,
index_t BlockSize,
index_t Gemm0MPerBlock,
index_t Gemm0NPerBlock,
index_t Gemm0KPerBlock,
index_t Gemm1NPerBlock,
index_t Gemm1KPerBlock,
index_t A0K1,
index_t B0K1,
index_t B1K1,
index_t Gemm0MPerXdl,
index_t Gemm0NPerXdl,
index_t Gemm0MXdlPerWave,
index_t Gemm0NXdlPerWave,
index_t Gemm1NXdlPerWave,
typename A0BlockTransferThreadClusterLengths_AK0_M_AK1,
typename A0BlockTransferThreadClusterArrangeOrder,
typename A0BlockTransferSrcAccessOrder,
index_t A0BlockTransferSrcVectorDim,
index_t A0BlockTransferSrcScalarPerVector,
index_t A0BlockTransferDstScalarPerVector_AK1,
bool A0BlockLdsExtraM,
typename B0BlockTransferThreadClusterLengths_BK0_N_BK1,
typename B0BlockTransferThreadClusterArrangeOrder,
typename B0BlockTransferSrcAccessOrder,
index_t B0BlockTransferSrcVectorDim,
index_t B0BlockTransferSrcScalarPerVector,
index_t B0BlockTransferDstScalarPerVector_BK1,
bool B0BlockLdsExtraN,
typename B1BlockTransferThreadClusterLengths_BK0_N_BK1,
typename B1BlockTransferThreadClusterArrangeOrder,
typename B1BlockTransferSrcAccessOrder,
index_t B1BlockTransferSrcVectorDim,
index_t B1BlockTransferSrcScalarPerVector,
index_t B1BlockTransferDstScalarPerVector_BK1,
bool B1BlockLdsExtraN,
index_t C1ShuffleMXdlPerWavePerShuffle,
index_t C1ShuffleGemm0NXdlPerWavePerShuffle,
typename CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock,
LoopScheduler LoopSched = LoopScheduler::Default>
struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
: public DeviceBatchedGemmMultipleDGemmMultipleD<A0Layout,
B0Layout,
D0sLayout,
B1Layout,
D1sLayout,
E1Layout,
A0DataType,
B0DataType,
D0sDataType,
B1DataType,
D1sDataType,
E1DataType,
A0ElementwiseOperation,
B0ElementwiseOperation,
CDE0ElementwiseOperation,
B1ElementwiseOperation,
CDE1ElementwiseOperation>
{
using DeviceOp = DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle;
static constexpr index_t NumD0Tensor = D0sDataType::Size();
static constexpr index_t NumD1Tensor = D1sDataType::Size();
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto I6 = Number<6>{};
static constexpr auto I7 = Number<7>{};
static constexpr auto I8 = Number<8>{};
static constexpr auto I9 = Number<9>{};
static constexpr auto gemm0_padder =
GemmPadder_v2<PadGemm0M, PadGemm0N, PadGemm0K, index_t, index_t, index_t>{
Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock};
static constexpr auto gemm1_padder =
GemmPadder_v2<PadGemm0M, PadGemm1N, PadGemm1K, index_t, index_t, index_t>{
Gemm0MPerBlock, Gemm1NPerBlock, Gemm1KPerBlock};
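// Note (a hedged reading of the parameters above): gemm0_padder pads the first GEMM's M/N/K
// extents to multiples of Gemm0MPerBlock/Gemm0NPerBlock/Gemm0KPerBlock, while gemm1_padder pads
// M/Gemm1N/Gemm1K; Gemm1 shares the M dimension with Gemm0, which is why both use Gemm0MPerBlock
// and PadGemm0M.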
// for Gemm0
static auto MakeA0GridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA0)
{
const auto a0_grid_desc_mraw_kraw = [&]() {
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, A0Layout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(StrideA0, I1));
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, A0Layout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(I1, StrideA0));
}
}();
return gemm0_padder.PadADescriptor_M_K(a0_grid_desc_mraw_kraw);
}
// for Gemm0
static auto MakeB0GridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b0_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, B0Layout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, B0Layout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
return gemm0_padder.PadBDescriptor_N_K(b0_grid_desc_nraw_kraw);
}
// for Gemm0
template <typename DLay>
static auto MakeD0GridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideD0)
{
const auto d0_grid_desc_mraw_nraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, DLay>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(StrideD0, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, DLay>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(I1, StrideD0));
}
}();
return gemm0_padder.PadCDescriptor_M_N(d0_grid_desc_mraw_nraw);
}
// for Gemm1
static auto MakeB1GridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b1_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, B1Layout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, B1Layout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
return gemm1_padder.PadBDescriptor_N_K(b1_grid_desc_nraw_kraw);
}
// for Gemm1
template <typename ELay>
static auto MakeE1GridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE1)
{
const auto e1_grid_desc_mraw_nraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ELay>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(StrideE1, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ELay>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(I1, StrideE1));
}
}();
return gemm1_padder.PadCDescriptor_M_N(e1_grid_desc_mraw_nraw);
}
static auto MakeD0sGridDescriptor_M_N(const std::array<index_t, NumD0Tensor>& MRaws,
const std::array<index_t, NumD0Tensor>& NRaws,
const std::array<index_t, NumD0Tensor>& DsStride)
{
return generate_tuple(
[&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, D0sLayout>>;
return DeviceOp::MakeD0GridDescriptor_M_N<DLayout>(MRaws[i], NRaws[i], DsStride[i]);
},
Number<NumD0Tensor>{});
}
static auto MakeD1sGridDescriptor_M_N(const std::array<index_t, NumD1Tensor>& MRaws,
const std::array<index_t, NumD1Tensor>& NRaws,
const std::array<index_t, NumD1Tensor>& DsStride)
{
return generate_tuple(
[&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, D1sLayout>>;
return DeviceOp::MakeE1GridDescriptor_M_N<DLayout>(MRaws[i], NRaws[i], DsStride[i]);
},
Number<NumD1Tensor>{});
}
struct ComputeBasePtrOfStridedBatch
{
ComputeBasePtrOfStridedBatch(index_t BatchStrideA0,
index_t BatchStrideB0,
std::array<index_t, NumD0Tensor> BatchStrideD0s,
index_t BatchStrideB1,
std::array<index_t, NumD1Tensor> BatchStrideD1s,
index_t BatchStrideE1)
: BatchStrideA0_(BatchStrideA0),
BatchStrideB0_(BatchStrideB0),
BatchStrideD0s_(BatchStrideD0s),
BatchStrideB1_(BatchStrideB1),
BatchStrideD1s_(BatchStrideD1s),
BatchStrideE1_(BatchStrideE1)
{
}
__host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA0_);
}
__host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB0_);
}
template <index_t I>
__host__ __device__ constexpr long_index_t GetD0BasePtr(index_t g_idx,
Number<I> d0_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideD0s_[d0_idx]);
}
__host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB1_);
}
__host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideE1_);
}
template <index_t I>
__host__ __device__ constexpr long_index_t GetD1BasePtr(index_t g_idx, Number<I> d1_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideD1s_[d1_idx]);
}
private:
index_t BatchStrideA0_;
index_t BatchStrideB0_;
std::array<index_t, NumD0Tensor> BatchStrideD0s_;
index_t BatchStrideB1_;
std::array<index_t, NumD1Tensor> BatchStrideD1s_;
index_t BatchStrideE1_;
};
using A0GridDesc_M_K = decltype(MakeA0GridDescriptor_M_K(1, 1, 1));
using B0GridDesc_N_K = decltype(MakeB0GridDescriptor_N_K(1, 1, 1));
using D0sGridDesc_M_N = remove_cvref_t<decltype(MakeD0sGridDescriptor_M_N({}, {}, {}))>;
using B1GridDesc_N_K = decltype(MakeB1GridDescriptor_N_K(1, 1, 1));
using D1sGridDesc_M_N = remove_cvref_t<decltype(MakeD1sGridDescriptor_M_N({}, {}, {}))>;
using E1GridDesc_M_N = decltype(MakeE1GridDescriptor_M_N<E1Layout>(1, 1, 1));
// GridwiseGemm
using GridwiseGemm = GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle<
A0DataType, // TODO: distinguish A/B datatype
Acc0DataType,
D0sDataType,
Acc1DataType,
C1ShuffleDataType,
D1sDataType,
E1DataType,
A0ElementwiseOperation,
B0ElementwiseOperation,
CDE0ElementwiseOperation,
B1ElementwiseOperation,
CDE1ElementwiseOperation,
InMemoryDataOperationEnum::Set,
A0GridDesc_M_K,
B0GridDesc_N_K,
D0sGridDesc_M_N,
B1GridDesc_N_K,
D1sGridDesc_M_N,
E1GridDesc_M_N,
NumGemm0KPrefetchStage,
BlockSize,
Gemm0MPerBlock,
Gemm0NPerBlock,
Gemm0KPerBlock,
Gemm1NPerBlock,
Gemm1KPerBlock,
A0K1,
B0K1,
B1K1,
Gemm0MPerXdl,
Gemm0NPerXdl,
Gemm0MXdlPerWave,
Gemm0NXdlPerWave,
Gemm1NXdlPerWave,
A0BlockTransferThreadClusterLengths_AK0_M_AK1,
A0BlockTransferThreadClusterArrangeOrder,
A0BlockTransferSrcAccessOrder,
A0BlockTransferSrcVectorDim,
A0BlockTransferSrcScalarPerVector,
A0BlockTransferDstScalarPerVector_AK1,
true,
A0BlockLdsExtraM,
B0BlockTransferThreadClusterLengths_BK0_N_BK1,
B0BlockTransferThreadClusterArrangeOrder,
B0BlockTransferSrcAccessOrder,
B0BlockTransferSrcVectorDim,
B0BlockTransferSrcScalarPerVector,
B0BlockTransferDstScalarPerVector_BK1,
true,
B0BlockLdsExtraN,
B1BlockTransferThreadClusterLengths_BK0_N_BK1,
B1BlockTransferThreadClusterArrangeOrder,
B1BlockTransferSrcAccessOrder,
B1BlockTransferSrcVectorDim,
B1BlockTransferSrcScalarPerVector,
B1BlockTransferDstScalarPerVector_BK1,
false,
B1BlockLdsExtraN,
C1ShuffleMXdlPerWavePerShuffle,
C1ShuffleGemm0NXdlPerWavePerShuffle,
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CDE1ShuffleBlockTransferScalarPerVector_NPerBlock,
LoopSched>;
using A0GridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultA0GridDescriptor_AK0_M_AK1(A0GridDesc_M_K{}))>;
using B0GridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultB0GridDescriptor_BK0_N_BK1(B0GridDesc_N_K{}))>;
using B1GridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultB1GridDescriptor_BK0_N_BK1(B1GridDesc_N_K{}))>;
// Argument
struct Argument : public BaseArgument
{
Argument(const A0DataType* p_a0_grid,
const B0DataType* p_b0_grid,
std::array<const void*, NumD0Tensor> p_d0s_grid,
const B1DataType* p_b1_grid,
std::array<const void*, NumD1Tensor> p_d1s_grid,
E1DataType* p_e1_grid,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw, // = ORaw
index_t Batch,
index_t StrideA0,
index_t StrideB0,
std::array<index_t, NumD0Tensor> StrideD0s,
index_t StrideB1,
std::array<index_t, NumD1Tensor> StrideD1s,
index_t StrideE1,
index_t BatchStrideA0,
index_t BatchStrideB0,
std::array<index_t, NumD0Tensor> BatchStrideD0s,
index_t BatchStrideB1,
std::array<index_t, NumD1Tensor> BatchStrideD1s,
index_t BatchStrideE1,
A0ElementwiseOperation a0_element_op,
B0ElementwiseOperation b0_element_op,
CDE0ElementwiseOperation cde0_element_op,
B1ElementwiseOperation b1_element_op,
CDE1ElementwiseOperation cde1_element_op)
: p_a0_grid_{p_a0_grid},
p_b0_grid_{p_b0_grid},
p_d0s_grid_{},
p_b1_grid_{p_b1_grid},
p_d1s_grid_{},
p_e1_grid_{p_e1_grid},
a0_grid_desc_m_k_{DeviceOp::MakeA0GridDescriptor_M_K(MRaw, KRaw, StrideA0)},
b0_grid_desc_n_k_{DeviceOp::MakeB0GridDescriptor_N_K(KRaw, NRaw, StrideB0)},
d0s_grid_desc_m_n_{},
b1_grid_desc_n_k_{DeviceOp::MakeB1GridDescriptor_N_K(NRaw, Gemm1NRaw, StrideB1)},
d1s_grid_desc_m_n_{},
e1_grid_desc_m_n_{
DeviceOp::MakeE1GridDescriptor_M_N<E1Layout>(MRaw, Gemm1NRaw, StrideE1)},
a0_grid_desc_ak0_m_ak1_{
GridwiseGemm::MakeDefaultA0GridDescriptor_AK0_M_AK1(a0_grid_desc_m_k_)},
b0_grid_desc_bk0_n_bk1_{
GridwiseGemm::MakeDefaultB0GridDescriptor_BK0_N_BK1(b0_grid_desc_n_k_)},
d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_{},
b1_grid_desc_bk0_n_bk1_{
GridwiseGemm::MakeDefaultB1GridDescriptor_BK0_N_BK1(b1_grid_desc_n_k_)},
d1s_grid_desc_mblock_mperblock_nblock_nperblock_{},
e1_grid_desc_mblock_mperblock_nblock_nperblock_{},
block_2_e1tile_map_{GridwiseGemm::MakeDefaultBlock2E1TileMap(e1_grid_desc_m_n_)},
a0_element_op_{a0_element_op},
b0_element_op_{b0_element_op},
cde0_element_op_{cde0_element_op},
b1_element_op_{b1_element_op},
cde1_element_op_{cde1_element_op},
batch_count_(Batch),
compute_base_ptr_of_batch_{BatchStrideA0,
BatchStrideB0,
BatchStrideD0s,
BatchStrideB1,
BatchStrideD1s,
BatchStrideE1}
{
std::cout << "a0_grid_desc_m_k_{" << a0_grid_desc_m_k_.GetLength(I0) << ", "
<< a0_grid_desc_m_k_.GetLength(I1) << "}" << std::endl;
std::cout << "b0_grid_desc_n_k_{" << b0_grid_desc_n_k_.GetLength(I0) << ", "
<< b0_grid_desc_n_k_.GetLength(I1) << "}" << std::endl;
std::cout << "d0s_grid_desc_m_n_[I0]{" << d0s_grid_desc_m_n_[I0].GetLength(I0) << ", "
<< d0s_grid_desc_m_n_[I0].GetLength(I1) << "}" << std::endl;
std::cout << "b1_grid_desc_n_k_{" << b1_grid_desc_n_k_.GetLength(I0) << ", "
<< b1_grid_desc_n_k_.GetLength(I1) << "}" << std::endl;
std::cout << "d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_{"
<< d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I0) << ", "
<< d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I1) << ", "
<< d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I2) << ", "
<< d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I3) << ", "
<< d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I4) << ", "
<< d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I5) << ", "
<< d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I6) << ", "
<< d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I7) << ", "
<< d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I8) << ", "
<< d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I9) << "}"
<< std::endl;
std::cout << "e1_grid_desc_m_n_{" << e1_grid_desc_m_n_.GetLength(I0) << ", "
<< e1_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
static_for<0, NumD0Tensor, 1>{}([&](auto i) {
using D0Layout = remove_cvref_t<tuple_element_t<i.value, D0sLayout>>;
using D0DataType = remove_cvref_t<tuple_element_t<i.value, D0sDataType>>;
// D0 pointer
p_d0s_grid_(i) = static_cast<const D0DataType*>(p_d0s_grid[i]);
// D0 desc
d0s_grid_desc_m_n_(i) =
DeviceOp::MakeD0GridDescriptor_M_N<D0Layout>(MRaw, NRaw, StrideD0s[i]);
});
static_for<0, NumD1Tensor, 1>{}([&](auto i) {
using D1Layout = remove_cvref_t<tuple_element_t<i.value, D1sLayout>>;
using D1DataType = remove_cvref_t<tuple_element_t<i.value, D1sDataType>>;
// D1 pointer
p_d1s_grid_(i) = static_cast<const D1DataType*>(p_d1s_grid[i]);
// D1 desc
d1s_grid_desc_m_n_(i) =
DeviceOp::MakeE1GridDescriptor_M_N<D1Layout>(MRaw, Gemm1NRaw, StrideD1s[i]);
});
if(GridwiseGemm::CheckValidity(a0_grid_desc_m_k_,
b0_grid_desc_n_k_,
b1_grid_desc_n_k_,
e1_grid_desc_m_n_,
block_2_e1tile_map_))
{
e1_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
e1_grid_desc_m_n_);
d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_ =
GridwiseGemm::MakeD0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(
d0s_grid_desc_m_n_);
d1s_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeD1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
d1s_grid_desc_m_n_);
}
}
// private:
// pointers
const A0DataType* p_a0_grid_;
const B0DataType* p_b0_grid_;
typename GridwiseGemm::D0sGridPointer p_d0s_grid_;
const B1DataType* p_b1_grid_;
typename GridwiseGemm::D1sGridPointer p_d1s_grid_;
E1DataType* p_e1_grid_;
// tensor descriptors for problem definition
A0GridDesc_M_K a0_grid_desc_m_k_;
B0GridDesc_N_K b0_grid_desc_n_k_;
D0sGridDesc_M_N d0s_grid_desc_m_n_;
B1GridDesc_N_K b1_grid_desc_n_k_;
D1sGridDesc_M_N d1s_grid_desc_m_n_;
E1GridDesc_M_N e1_grid_desc_m_n_;
// tensor descriptors for block/thread-wise copy
A0GridDesc_AK0_M_AK1 a0_grid_desc_ak0_m_ak1_;
B0GridDesc_BK0_N_BK1 b0_grid_desc_bk0_n_bk1_;
typename GridwiseGemm::D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_;
B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_;
typename GridwiseGemm::D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
d1s_grid_desc_mblock_mperblock_nblock_nperblock_;
typename GridwiseGemm::E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
e1_grid_desc_mblock_mperblock_nblock_nperblock_;
// block-to-e1-tile map
typename GridwiseGemm::DefaultBlock2E1TileMap block_2_e1tile_map_;
// element-wise op
A0ElementwiseOperation a0_element_op_;
B0ElementwiseOperation b0_element_op_;
CDE0ElementwiseOperation cde0_element_op_;
B1ElementwiseOperation b1_element_op_;
CDE1ElementwiseOperation cde1_element_op_;
// batch
index_t batch_count_;
ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
if(!GridwiseGemm::CheckValidity(arg.a0_grid_desc_m_k_,
arg.b0_grid_desc_n_k_,
arg.b1_grid_desc_n_k_,
arg.e1_grid_desc_m_n_,
arg.block_2_e1tile_map_))
{
throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
}
const index_t grid_size =
arg.block_2_e1tile_map_.CalculateGridSize(arg.e1_grid_desc_m_n_) * arg.batch_count_;
// Gemm0_K
const auto K = arg.a0_grid_desc_m_k_.GetLength(I1);
auto launch_kernel = [&](auto has_main_k_block_loop_) {
const auto kernel = kernel_batched_gemm_gemm_xdl_cshuffle_v1<
GridwiseGemm,
A0DataType, // TODO: distinguish A/B datatype
typename GridwiseGemm::D0sGridPointer,
typename GridwiseGemm::D1sGridPointer,
E1DataType,
A0ElementwiseOperation,
B0ElementwiseOperation,
CDE0ElementwiseOperation,
B1ElementwiseOperation,
CDE1ElementwiseOperation,
DeviceOp::A0GridDesc_AK0_M_AK1,
DeviceOp::B0GridDesc_BK0_N_BK1,
typename GridwiseGemm::D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5,
DeviceOp::B1GridDesc_BK0_N_BK1,
typename GridwiseGemm::D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename GridwiseGemm::E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename GridwiseGemm::DefaultBlock2E1TileMap,
ComputeBasePtrOfStridedBatch,
has_main_k_block_loop_>;
return launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a0_grid_,
arg.p_b0_grid_,
arg.p_d0s_grid_,
arg.p_b1_grid_,
arg.p_d1s_grid_,
arg.p_e1_grid_,
arg.a0_element_op_,
arg.b0_element_op_,
arg.cde0_element_op_,
arg.b1_element_op_,
arg.cde1_element_op_,
arg.a0_grid_desc_ak0_m_ak1_,
arg.b0_grid_desc_bk0_n_bk1_,
arg.d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_,
arg.b1_grid_desc_bk0_n_bk1_,
arg.d1s_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.e1_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.block_2_e1tile_map_,
arg.batch_count_,
arg.compute_base_ptr_of_batch_);
};
// Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so we only need
// to be concerned with Gemm0's main K loop
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
{
return launch_kernel(integral_constant<bool, true>{});
}
else
{
return launch_kernel(integral_constant<bool, false>{});
}
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
{
return false;
}
return GridwiseGemm::CheckValidity(arg.a0_grid_desc_m_k_,
arg.b0_grid_desc_n_k_,
arg.b1_grid_desc_n_k_,
arg.e1_grid_desc_m_n_,
arg.block_2_e1tile_map_);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const A0DataType* p_a0,
const B0DataType* p_b0,
std::array<const void*, NumD0Tensor> p_d0s,
const B1DataType* p_b1,
std::array<const void*, NumD1Tensor> p_d1s,
E1DataType* p_e1,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw,
index_t Batch,
index_t StrideA0,
index_t StrideB0,
std::array<index_t, NumD0Tensor> StrideD0s,
index_t StrideB1,
std::array<index_t, NumD1Tensor> StrideD1s,
index_t StrideE1,
index_t BatchStrideA0,
index_t BatchStrideB0,
std::array<index_t, NumD0Tensor> BatchStrideD0s,
index_t BatchStrideB1,
std::array<index_t, NumD1Tensor> BatchStrideD1s,
index_t BatchStrideE1,
A0ElementwiseOperation a0_element_op,
B0ElementwiseOperation b0_element_op,
CDE0ElementwiseOperation cde0_element_op,
B1ElementwiseOperation b1_element_op,
CDE1ElementwiseOperation cde1_element_op)
{
return Argument{p_a0, p_b0,
p_d0s, p_b1,
p_d1s, p_e1,
MRaw, NRaw,
KRaw, Gemm1NRaw,
Batch, StrideA0,
StrideB0, StrideD0s,
StrideB1, StrideD1s,
StrideE1, BatchStrideA0,
BatchStrideB0, BatchStrideD0s,
BatchStrideB1, BatchStrideD1s,
BatchStrideE1, a0_element_op,
b0_element_op, cde0_element_op,
b1_element_op, cde1_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a0,
const void* p_b0,
std::array<const void*, NumD0Tensor> p_d0s,
const void* p_b1,
std::array<const void*, NumD1Tensor> p_d1s,
void* p_e1,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw,
index_t Batch,
index_t StrideA0,
index_t StrideB0,
std::array<ck::index_t, NumD0Tensor> StrideD0s,
index_t StrideB1,
std::array<ck::index_t, NumD1Tensor> StrideD1s,
index_t StrideE1,
index_t BatchStrideA0,
index_t BatchStrideB0,
std::array<ck::index_t, NumD0Tensor> BatchStrideD0s,
index_t BatchStrideB1,
std::array<ck::index_t, NumD1Tensor> BatchStrideD1s,
index_t BatchStrideE1,
A0ElementwiseOperation a0_element_op,
B0ElementwiseOperation b0_element_op,
CDE0ElementwiseOperation cde0_element_op,
B1ElementwiseOperation b1_element_op,
CDE1ElementwiseOperation cde1_element_op) override
{
return std::make_unique<Argument>(static_cast<const A0DataType*>(p_a0),
static_cast<const B0DataType*>(p_b0),
p_d0s,
static_cast<const B1DataType*>(p_b1),
p_d1s,
static_cast<E1DataType*>(p_e1),
MRaw,
NRaw,
KRaw,
Gemm1NRaw,
Batch,
StrideA0,
StrideB0,
StrideD0s,
StrideB1,
StrideD1s,
StrideE1,
BatchStrideA0,
BatchStrideB0,
BatchStrideD0s,
BatchStrideB1,
BatchStrideD1s,
BatchStrideE1,
a0_element_op,
b0_element_op,
cde0_element_op,
b1_element_op,
cde1_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle"
<< "<"
<< BlockSize << ", "
<< Gemm0MPerBlock << ", "
<< Gemm0NPerBlock << ", "
<< Gemm0KPerBlock << ", "
<< A0K1 << ", "
<< B0K1 << ", "
<< B1K1 << ", "
<< Gemm0MPerXdl << ", "
<< Gemm0NPerXdl << ", "
<< Gemm0MXdlPerWave << ", "
<< Gemm0NXdlPerWave << ", "
<< Gemm1NXdlPerWave << "> ";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
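// A minimal host-side usage sketch of the device op above (illustration only, guarded out with
// #if 0). Assumptions that are not part of this file: DeviceOpInstance is a fully specialized
// alias of DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle, the p_*_dev pointers are device
// buffers matching the chosen data types, and the size/stride/element-op variables are defined by
// the caller.
#if 0
DeviceOpInstance op;
auto arg_ptr = op.MakeArgumentPointer(p_a0_dev, p_b0_dev, p_d0s_dev, p_b1_dev, p_d1s_dev, p_e1_dev,
                                      M, N, K, Gemm1N, Batch,
                                      StrideA0, StrideB0, StrideD0s, StrideB1, StrideD1s, StrideE1,
                                      BatchStrideA0, BatchStrideB0, BatchStrideD0s,
                                      BatchStrideB1, BatchStrideD1s, BatchStrideE1,
                                      a0_op, b0_op, cde0_op, b1_op, cde1_op);
if(op.IsSupportedArgument(arg_ptr.get()))
{
    auto invoker_ptr = op.MakeInvokerPointer();
    // Run returns the value reported by launch_and_time_kernel
    float time = invoker_ptr->Run(arg_ptr.get(), StreamConfig{});
}
#endif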
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename ReducePtrsGlobal,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename ReduceInElementwiseOperations,
typename ReduceAccElementwiseOperations,
typename AGridDesc_AK0_M_AK1,
typename BGridDesc_BK0_N_BK1,
typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename ReduceGridDescriptor_MBlock_MPerBlock,
typename ComputeBasePtrOfBatch,
typename Block2CTileMap,
bool HasMainK0BlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_reduce_xdl_cshuffle_v1(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
ReducePtrsGlobal p_reduces_grid,
const index_t batch_count,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const ReduceInElementwiseOperations reduce_in_element_ops,
const ReduceAccElementwiseOperations reduce_out_element_ops,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock,
const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock,
const ComputeBasePtrOfBatch compute_base_ptr_of_batch_,
const Block2CTileMap block_2_ctile_map)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
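// The 1-D launch grid is laid out as batch_count consecutive groups of num_blocks_per_batch
// blocks, so dividing the block id by the group size recovers the batch index g_idx.
// Illustrative numbers: with 8 blocks and batch_count = 4, num_blocks_per_batch = 2 and
// block 5 works on batch 5 / 2 = 2.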
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch_.GetABasePtr(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch_.GetBBasePtr(g_idx)));
const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch_.GetCBasePtr(g_idx)));
static_for<0, p_reduces_grid.Size(), 1>{}([&](auto In) {
const long_index_t d_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch_.GetDBasePtr(g_idx, In)));
p_reduces_grid(In) = p_reduces_grid(In) + d_batch_offset;
});
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainK0BlockLoop>(p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset,
p_c_grid + c_batch_offset,
p_reduces_grid,
p_shared,
a_element_op,
b_element_op,
c_element_op,
reduce_in_element_ops,
reduce_out_element_ops,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
c_grid_desc_mblock_mperblock_nblock_nperblock,
reduce_grid_desc_mblock_mperblock,
block_2_ctile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
ignore = p_reduces_grid;
ignore = batch_count;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
ignore = reduce_in_element_ops;
ignore = reduce_out_element_ops;
ignore = a_grid_desc_ak0_m_ak1;
ignore = b_grid_desc_bk0_n_bk1;
ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
ignore = reduce_grid_desc_mblock_mperblock;
ignore = compute_base_ptr_of_batch_;
ignore = block_2_ctile_map;
#endif
}
// Note: the inter-wave loop scheduler is rolled out to the c-shuffle version first, because the
// non-c-shuffle version currently has compiler issues with register spills that in turn cause
// validation failures.
template <typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType,
typename CDataType,
typename GemmAccDataType,
typename CShuffleDataType,
typename ReduceAccDataType,
typename ReducePtrsGlobal,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename ReduceOperations,
typename ReduceInElementwiseOperations,
typename ReduceAccElementwiseOperations,
typename ReduceGlobalMemoryDataOperation,
GemmSpecialization GemmSpec,
index_t NumGemmKPrefetchStage,
index_t BlockSize,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t AK1,
index_t BK1,
index_t MPerXDL,
index_t NPerXDL,
index_t MXdlPerWave,
index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsExtraN,
index_t CShuffleMXdlPerWavePerShuffle,
index_t CShuffleNXdlPerWavePerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
typename CReduceThreadClusterLengths_MPerBlock_NPerBlock,
index_t CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperations::Size()>
{
using DeviceOp = DeviceBatchedGemmReduce_Xdl_CShuffle;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
{
const auto a_grid_desc_mraw_kraw = [&]() {
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(StrideA, I1));
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(I1, StrideA));
}
}();
const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
const auto MPad = M - MRaw;
const auto KPad = K - KRaw;
if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
GemmSpec == GemmSpecialization::MNKPadding)
{
// pad both M and K
assert(K % AK1 == 0);
const auto AK0 = K / AK1;
const auto a_grid_desc_m_k =
transform_tensor_descriptor(a_grid_desc_mraw_kraw,
make_tuple(make_right_pad_transform(MRaw, MPad),
make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto a_grid_desc_ak0_m_ak1 =
transform_tensor_descriptor(a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(M)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_ak0_m_ak1;
}
else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
GemmSpec == GemmSpecialization::MNPadding)
{
// pad M, but not K
assert(KRaw % AK1 == 0);
const auto AK0 = KRaw / AK1;
const auto a_grid_desc_ak0_m_ak1 =
transform_tensor_descriptor(a_grid_desc_mraw_kraw,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_right_pad_transform(MRaw, MPad)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_ak0_m_ak1;
}
else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
GemmSpec == GemmSpecialization::NKPadding)
{
// pad K, but not M
assert(K % AK1 == 0);
const auto AK0 = K / AK1;
const auto a_grid_desc_m_k = transform_tensor_descriptor(
a_grid_desc_mraw_kraw,
make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto a_grid_desc_ak0_m_ak1 =
transform_tensor_descriptor(a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(MRaw)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_ak0_m_ak1;
}
else
{
// do not pad M or K
assert(KRaw % AK1 == 0);
const auto AK0 = KRaw / AK1;
const auto a_grid_desc_ak0_m_ak1 =
transform_tensor_descriptor(a_grid_desc_mraw_kraw,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(MRaw)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_ak0_m_ak1;
}
}
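// Worked example with illustrative (hypothetical) sizes: MRaw = 1000, KRaw = 130, MPerBlock = 256,
// KPerBlock = 32, AK1 = 8. Then M = 1024, K = 160, MPad = 24 and KPad = 30; with
// GemmSpec = MNKPadding the resulting descriptor is [AK0 = 160 / 8 = 20, M = 1024, AK1 = 8].
// The specializations that do not pad K instead require KRaw % AK1 == 0 (see the asserts above).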
static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
const auto NPad = N - NRaw;
const auto KPad = K - KRaw;
if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
GemmSpec == GemmSpecialization::MNKPadding)
{
// pad both N and K
assert(K % BK1 == 0);
const auto BK0 = K / BK1;
const auto b_grid_desc_n_k =
transform_tensor_descriptor(b_grid_desc_nraw_kraw,
make_tuple(make_right_pad_transform(NRaw, NPad),
make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto b_grid_desc_bk0_n_bk1 =
transform_tensor_descriptor(b_grid_desc_n_k,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(N)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_bk0_n_bk1;
}
else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
GemmSpec == GemmSpecialization::MNPadding)
{
// pad N, but not K
assert(KRaw % BK1 == 0);
const auto BK0 = KRaw / BK1;
const auto b_grid_desc_bk0_n_bk1 =
transform_tensor_descriptor(b_grid_desc_nraw_kraw,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_bk0_n_bk1;
}
else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
GemmSpec == GemmSpecialization::MKPadding)
{
// pad K, but not N
assert(K % BK1 == 0);
const auto BK0 = K / BK1;
const auto b_grid_desc_n_k = transform_tensor_descriptor(
b_grid_desc_nraw_kraw,
make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto b_grid_desc_bk0_n_bk1 =
transform_tensor_descriptor(b_grid_desc_n_k,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(NRaw)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_bk0_n_bk1;
}
else
{
// do not pad N or K
assert(KRaw % BK1 == 0);
const auto BK0 = KRaw / BK1;
const auto b_grid_desc_bk0_n_bk1 =
transform_tensor_descriptor(b_grid_desc_nraw_kraw,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(NRaw)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_bk0_n_bk1;
}
}
static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC)
{
const auto c_grid_desc_mraw_nraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(StrideC, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(I1, StrideC));
}
}();
const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
const auto MPad = M - MRaw;
const auto NPad = N - NRaw;
if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
GemmSpec == GemmSpecialization::MNKPadding)
{
// pad M and N
return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
make_tuple(make_right_pad_transform(MRaw, MPad),
make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
GemmSpec == GemmSpecialization::MKPadding)
{
// pad M, but not N
return transform_tensor_descriptor(
c_grid_desc_mraw_nraw,
make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
GemmSpec == GemmSpecialization::NKPadding)
{
// pad N, but not M
return transform_tensor_descriptor(
c_grid_desc_mraw_nraw,
make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
// do not pad M or N
return c_grid_desc_mraw_nraw;
}
}
// assume D is a packed tensor
static auto MakeReduceGridDescriptor_M(index_t MRaw)
{
const auto d_grid_desc_mraw = make_naive_tensor_descriptor_packed(make_tuple(MRaw));
const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
const auto MPad = M - MRaw;
if constexpr(GemmSpec == GemmSpecialization::MPadding ||
GemmSpec == GemmSpecialization::MNPadding ||
GemmSpec == GemmSpecialization::MKPadding ||
GemmSpec == GemmSpecialization::MNKPadding)
{
// pad M
return transform_tensor_descriptor(d_grid_desc_mraw,
make_tuple(make_right_pad_transform(MRaw, MPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
}
else
{
// do not pad M
return d_grid_desc_mraw;
}
}
using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
using ReduceGridDesc_M = decltype(MakeReduceGridDescriptor_M(1));
struct ComputeBasePtrOfStridedBatch
{
ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideC,
index_t BatchStrideD)
: BatchStrideA_(BatchStrideA),
BatchStrideB_(BatchStrideB),
BatchStrideC_(BatchStrideC),
BatchStrideD_(BatchStrideD)
{
}
__host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_);
}
__host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideC_);
}
template <index_t I>
__host__ __device__ constexpr long_index_t GetDBasePtr(index_t g_idx,
Number<I> reduction_idx) const
{
// TODO - Support sequence of StrideD in MakeArgument()
(void)reduction_idx;
return g_idx * static_cast<long_index_t>(BatchStrideD_);
}
private:
index_t BatchStrideA_;
index_t BatchStrideB_;
index_t BatchStrideC_;
index_t BatchStrideD_;
};
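// Illustrative example: with BatchStrideA_ = 4096, batch g_idx = 3 starts at element offset
// 3 * 4096 = 12288 in the A buffer. The Argument below initializes these strides from the
// element-space sizes of the per-batch descriptors, i.e. the batches are assumed to be packed
// back to back.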
// GridwiseGemm
using GridwiseGemm = GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
ADataType, // TODO: distinguish A/B datatype
GemmAccDataType,
CShuffleDataType,
CDataType,
ReduceAccDataType,
ReducePtrsGlobal,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
ReduceOperations,
ReduceInElementwiseOperations,
ReduceAccElementwiseOperations,
InMemoryDataOperationEnum::Set,
ReduceGlobalMemoryDataOperation,
AGridDesc_AK0_M_AK1,
BGridDesc_BK0_N_BK1,
CGridDesc_M_N,
ReduceGridDesc_M,
NumGemmKPrefetchStage,
BlockSize,
MPerBlock,
NPerBlock,
KPerBlock,
AK1,
BK1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_AK0_M_AK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_AK1,
false,
ABlockLdsExtraM,
BBlockTransferThreadClusterLengths_BK0_N_BK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_BK1,
false,
BBlockLdsExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CShuffleBlockTransferScalarPerVector_NPerBlock,
CReduceThreadClusterLengths_MPerBlock_NPerBlock,
CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
LoopSched>;
// Argument
struct Argument : public BaseArgument
{
Argument(const ADataType* p_a_grid,
const BDataType* p_b_grid,
CDataType* p_c_grid,
ReducePtrsGlobal p_reduces_grid,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t StrideA,
index_t StrideB,
index_t StrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
ReduceInElementwiseOperations reduce_in_element_ops,
ReduceAccElementwiseOperations reduce_out_element_ops,
index_t Batch)
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_c_grid_{p_c_grid},
p_reduces_grid_{p_reduces_grid},
Batch_(Batch),
a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)},
reduce_grid_desc_m_{DeviceOp::MakeReduceGridDescriptor_M(MRaw)},
c_grid_desc_mblock_mperblock_nblock_nperblock_{},
reduce_grid_desc_mblock_mperblock_{},
compute_base_ptr_of_batch_{
type_convert<index_t>(a_grid_desc_ak0_m_ak1_.GetElementSpaceSize()),
type_convert<index_t>(b_grid_desc_bk0_n_bk1_.GetElementSpaceSize()),
type_convert<index_t>(c_grid_desc_m_n_.GetElementSpaceSize()),
type_convert<index_t>(reduce_grid_desc_m_.GetElementSpaceSize())},
block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op},
reduce_in_element_ops_{reduce_in_element_ops},
reduce_out_element_ops_{reduce_out_element_ops}
{
if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
b_grid_desc_bk0_n_bk1_,
c_grid_desc_m_n_,
block_2_ctile_map_))
{
c_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
c_grid_desc_m_n_);
reduce_grid_desc_mblock_mperblock_ =
GridwiseGemm::MakeReduceGridDescriptor_MBlock_MPerBlock(reduce_grid_desc_m_);
}
}
// private:
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
CDataType* p_c_grid_;
ReducePtrsGlobal p_reduces_grid_;
index_t Batch_;
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
CGridDesc_M_N c_grid_desc_m_n_;
ReduceGridDesc_M reduce_grid_desc_m_;
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock_;
typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock
reduce_grid_desc_mblock_mperblock_;
ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;
typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
ReduceInElementwiseOperations reduce_in_element_ops_;
ReduceAccElementwiseOperations reduce_out_element_ops_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
std::cout << "arg.Batch_ = " << arg.Batch_ << std::endl;
std::cout << "arg.a_grid_desc_ak0_m_ak1_{"
<< arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) << ", "
<< arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) << ", "
<< arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.b_grid_desc_bk0_n_bk1_{"
<< arg.b_grid_desc_bk0_n_bk1_.GetLength(I0) << ", "
<< arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) << ", "
<< arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
std::cout << "arg.reduce_grid_desc_m_{ " << arg.reduce_grid_desc_m_.GetLength(I0) << "}"
<< std::endl;
}
#endif
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_m_n_,
arg.block_2_ctile_map_))
{
throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
}
const index_t grid_size =
arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.Batch_;
const auto K =
arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
float elapsed_time = 0.0f;
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
{
const auto kernel = kernel_batched_gemm_reduce_xdl_cshuffle_v1<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
ReducePtrsGlobal,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
ReduceInElementwiseOperations,
ReduceAccElementwiseOperations,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock,
ComputeBasePtrOfStridedBatch,
typename GridwiseGemm::DefaultBlock2CTileMap,
true>;
elapsed_time =
launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.p_reduces_grid_,
arg.Batch_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_,
arg.reduce_in_element_ops_,
arg.reduce_out_element_ops_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.reduce_grid_desc_mblock_mperblock_,
arg.compute_base_ptr_of_batch_,
arg.block_2_ctile_map_);
}
else
{
const auto kernel = kernel_batched_gemm_reduce_xdl_cshuffle_v1<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
ReducePtrsGlobal,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
ReduceInElementwiseOperations,
ReduceAccElementwiseOperations,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock,
ComputeBasePtrOfStridedBatch,
typename GridwiseGemm::DefaultBlock2CTileMap,
false>;
elapsed_time =
launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.p_reduces_grid_,
arg.Batch_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_,
arg.reduce_in_element_ops_,
arg.reduce_out_element_ops_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.reduce_grid_desc_mblock_mperblock_,
arg.compute_base_ptr_of_batch_,
arg.block_2_ctile_map_);
}
return elapsed_time;
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_m_n_,
arg.block_2_ctile_map_);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
auto casted_p_arg = dynamic_cast<const Argument*>(p_arg);
if(casted_p_arg == nullptr)
{
return false;
}
else
{
return IsSupportedArgument(*casted_p_arg);
}
}
static constexpr int NumReduce = ReduceOperations::Size();
static auto MakeArgument(const void* p_a,
const void* p_b,
const void* p_bias,
std::array<const void*, 0> p_ds,
void* p_c,
std::array<void*, NumReduce> p_reduces,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
ck::index_t StrideC,
std::array<ck::index_t, 0> StrideDs,
std::array<void*, 3> gemm_element_ops,
std::array<void*, 0> d_element_ops,
std::array<void*, NumReduce> reduce_in_element_op,
std::array<void*, NumReduce> reduce_out_element_op,
index_t Batch)
{
(void)p_bias;
(void)p_ds;
(void)StrideDs;
(void)d_element_ops;
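// Re-type the type-erased std::array<void*, NumReduce> inputs into the statically typed tuples
// (reduce output pointers and element-wise ops) expected by the gridwise kernel.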
ReducePtrsGlobal reduce_tuple = generate_tuple(
[&](auto I) {
auto tmp = ReducePtrsGlobal{}[I];
using T = remove_pointer_t<decltype(tmp)>;
return static_cast<T*>(p_reduces[I]);
},
Number<NumReduce>{});
ReduceInElementwiseOperations reduce_in_element_ops = generate_tuple(
[&](auto I) {
auto tmp = ReduceInElementwiseOperations{}[I];
using T = remove_pointer_t<decltype(tmp)>;
return *(static_cast<T*>(reduce_in_element_op[I]));
},
Number<NumReduce>{});
ReduceAccElementwiseOperations reduce_out_element_ops = generate_tuple(
[&](auto I) {
auto tmp = ReduceAccElementwiseOperations{}[I];
using T = remove_pointer_t<decltype(tmp)>;
return *(static_cast<T*>(reduce_out_element_op[I]));
},
Number<NumReduce>{});
AElementwiseOperation a_element_op =
*(static_cast<AElementwiseOperation*>(gemm_element_ops[0]));
BElementwiseOperation b_element_op =
*(static_cast<BElementwiseOperation*>(gemm_element_ops[1]));
CElementwiseOperation c_element_op =
*(static_cast<CElementwiseOperation*>(gemm_element_ops[2]));
return Argument{static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<CDataType*>(p_c),
reduce_tuple,
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
reduce_in_element_ops,
reduce_out_element_ops,
Batch};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
const void* p_bias,
std::array<const void*, 0> p_ds,
void* p_c,
std::array<void*, NumReduce> p_reduces,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
ck::index_t StrideC,
std::array<ck::index_t, 0> StrideDs,
std::array<void*, 3> gemm_element_ops,
std::array<void*, 0> d_element_ops,
std::array<void*, NumReduce> reduce_in_element_op,
std::array<void*, NumReduce> reduce_out_element_op,
index_t Batch = 1) override
{
(void)p_bias;
(void)p_ds;
(void)StrideDs;
(void)d_element_ops;
ReducePtrsGlobal reduce_tuple = generate_tuple(
[&](auto I) {
auto tmp = ReducePtrsGlobal{}[I];
using T = remove_pointer_t<decltype(tmp)>;
return static_cast<T*>(p_reduces[I]);
},
Number<NumReduce>{});
ReduceInElementwiseOperations reduce_in_element_ops = generate_tuple(
[&](auto I) {
auto tmp = ReduceInElementwiseOperations{}[I];
using T = remove_pointer_t<decltype(tmp)>;
return *(static_cast<T*>(reduce_in_element_op[I]));
},
Number<NumReduce>{});
ReduceAccElementwiseOperations reduce_out_element_ops = generate_tuple(
[&](auto I) {
auto tmp = ReduceAccElementwiseOperations{}[I];
using T = remove_pointer_t<decltype(tmp)>;
return *(static_cast<T*>(reduce_out_element_op[I]));
},
Number<NumReduce>{});
AElementwiseOperation a_element_op =
*(static_cast<AElementwiseOperation*>(gemm_element_ops[0]));
BElementwiseOperation b_element_op =
*(static_cast<BElementwiseOperation*>(gemm_element_ops[1]));
CElementwiseOperation c_element_op =
*(static_cast<CElementwiseOperation*>(gemm_element_ops[2]));
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<CDataType*>(p_c),
reduce_tuple,
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
reduce_in_element_ops,
reduce_out_element_ops,
Batch);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceBatchedGemmReduce_Xdl_CShuffle"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< KPerBlock << ", "
<< AK1 << ", "
<< BK1
<< ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename ALayout,
typename B0Layout,
typename B1Layout,
typename CLayout,
typename ADataType,
typename B0DataType,
typename B1DataType,
typename CDataType,
typename AElementwiseOperation,
typename B0ElementwiseOperation,
typename Acc0ElementwiseOperation,
typename B1ElementwiseOperation,
typename CElementwiseOperation>
struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b0,
const void* p_b1,
void* p_c,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t O,
ck::index_t Batch,
ck::index_t StrideA,
ck::index_t StrideB0,
ck::index_t StrideB1,
ck::index_t StrideC,
ck::index_t BatchStrideA,
ck::index_t BatchStrideB0,
ck::index_t BatchStrideB1,
ck::index_t BatchStrideC,
AElementwiseOperation a_element_op,
B0ElementwiseOperation b0_element_op,
Acc0ElementwiseOperation acc0_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename ALayout,
typename B0Layout,
typename B1Layout,
typename CPermuteNumDims_G_M_Gemm1N, // Sequence<>
typename ADataType,
typename B0DataType,
typename B1DataType,
typename CDataType,
typename AElementwiseOperation,
typename B0ElementwiseOperation,
typename Acc0ElementwiseOperation,
typename B1ElementwiseOperation,
typename CElementwiseOperation>
struct DeviceBatchedGemmSoftmaxGemmPermute : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b0,
const void* p_b1,
void* p_c,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t O,
ck::index_t Batch,
std::vector<index_t> c_gs_ms_os_lengths,
std::vector<index_t> c_gs_ms_os_strides,
ck::index_t StrideA,
ck::index_t StrideB0,
ck::index_t StrideB1,
ck::index_t BatchStrideA,
ck::index_t BatchStrideB0,
ck::index_t BatchStrideB1,
AElementwiseOperation a_element_op,
B0ElementwiseOperation b0_element_op,
Acc0ElementwiseOperation acc0_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename AccElementwiseOperation,
typename B1ElementwiseOperation,
typename CElementwiseOperation,
typename AGridDesc_AK0_M_AK1,
typename BGridDesc_BK0_N_BK1,
typename B1GridDesc_BK0_N_BK1,
typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename Block2CTileMap,
typename ComputeBasePtrOfStridedBatch,
typename C0MatrixMask,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatAB* __restrict__ p_b1_grid,
FloatC* __restrict__ p_c_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const AccElementwiseOperation acc_element_op,
const B1ElementwiseOperation b1_element_op,
const CElementwiseOperation c_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2CTileMap block_2_ctile_map,
const index_t batch_count,
const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
const C0MatrixMask c0_matrix_mask)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset,
p_b1_grid + b1_batch_offset,
p_c_grid + c_batch_offset,
p_shared,
a_element_op,
b_element_op,
acc_element_op,
b1_element_op,
c_element_op,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
b1_grid_desc_bk0_n_bk1,
c_grid_desc_mblock_mperblock_nblock_nperblock,
block_2_ctile_map,
c0_matrix_mask);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_b1_grid;
ignore = p_c_grid;
ignore = a_element_op;
ignore = b_element_op;
ignore = acc_element_op;
ignore = b1_element_op;
ignore = c_element_op;
ignore = a_grid_desc_ak0_m_ak1;
ignore = b_grid_desc_bk0_n_bk1;
ignore = b1_grid_desc_bk0_n_bk1;
ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
ignore = block_2_ctile_map;
ignore = batch_count;
ignore = compute_base_ptr_of_batch;
ignore = c0_matrix_mask;
#endif // end of if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
}
// Computes C = A * B0 * B1, where Acc0 = A * B0 and Acc1 = Acc0 * B1
template <typename ALayout,
typename BLayout, // B0Layout
typename B1Layout,
typename CPermuteNumDims_G_M_Gemm1N, // Sequence<NumDimG, NumDimM, NumDimGemm1N>
typename ADataType,
typename BDataType,
typename B1DataType,
typename CDataType,
typename GemmAccDataType,
typename CShuffleDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename AccElementwiseOperation,
typename B1ElementwiseOperation,
typename CElementwiseOperation,
GemmSpecialization GemmSpec,
index_t NumGemmKPrefetchStage,
index_t BlockSize,
index_t MPerBlock,
index_t NPerBlock, // Gemm0NPerBlock
index_t KPerBlock, // Gemm0KPerBlock
index_t Gemm1NPerBlock,
index_t Gemm1KPerBlock,
index_t AK1,
index_t BK1,
index_t B1K1,
index_t MPerXDL,
index_t NPerXDL,
index_t MXdlPerWave,
index_t NXdlPerWave,
index_t Gemm1NXdlPerWave,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsExtraN,
typename B1BlockTransferThreadClusterLengths_BK0_N_BK1,
typename B1BlockTransferThreadClusterArrangeOrder,
typename B1BlockTransferSrcAccessOrder,
index_t B1BlockTransferSrcVectorDim,
index_t B1BlockTransferSrcScalarPerVector,
index_t B1BlockTransferDstScalarPerVector_BK1,
bool B1BlockLdsExtraN,
index_t CShuffleMXdlPerWavePerShuffle,
index_t CShuffleNXdlPerWavePerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
bool MaskOutUpperTriangle,
LoopScheduler LoopSched = LoopScheduler::Default>
struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
: public DeviceBatchedGemmSoftmaxGemmPermute<ALayout,
BLayout,
B1Layout,
CPermuteNumDims_G_M_Gemm1N,
ADataType,
BDataType,
B1DataType,
CDataType,
AElementwiseOperation,
BElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CElementwiseOperation>
{
using DeviceOp = DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto matrix_padder =
GemmGemmPadder<GemmSpec, index_t, index_t, index_t, index_t>{
MPerBlock, NPerBlock, KPerBlock, Gemm1NPerBlock};
static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
{
const auto a_grid_desc_mraw_kraw = [&]() {
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(StrideA, I1));
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(I1, StrideA));
}
}();
const auto a_grid_desc_m_k = matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
const auto M = a_grid_desc_m_k.GetLength(I0);
const auto K = a_grid_desc_m_k.GetLength(I1);
const auto AK0 = K / AK1;
return transform_tensor_descriptor(a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(M)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
const auto b_grid_desc_n_k = matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
const auto N = b_grid_desc_n_k.GetLength(I0);
const auto K = b_grid_desc_n_k.GetLength(I1);
const auto BK0 = K / BK1;
return transform_tensor_descriptor(b_grid_desc_n_k,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(N)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
// Args: Gemm1KRaw, Gemm1NRaw, StrideB1
static auto MakeB1GridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b1_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, B1Layout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, B1Layout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
const auto b1_grid_desc_n_k = matrix_padder.PadB1Descriptor_N_K(b1_grid_desc_nraw_kraw);
const auto N = b1_grid_desc_n_k.GetLength(I0);
const auto K = b1_grid_desc_n_k.GetLength(I1);
const auto B1K0 = K / B1K1;
return transform_tensor_descriptor(
b1_grid_desc_n_k,
make_tuple(make_unmerge_transform(make_tuple(B1K0, B1K1)),
make_pass_through_transform(N)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
// assume C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...]
static auto MakeCGridDescriptor_M_N(const std::vector<index_t>& c_gs_ms_ns_lengths_vec,
const std::vector<index_t>& c_gs_ms_ns_strides_vec)
{
constexpr index_t NumDimG = CPermuteNumDims_G_M_Gemm1N::At(I0);
constexpr index_t NumDimM = CPermuteNumDims_G_M_Gemm1N::At(I1);
constexpr index_t NumDimN = CPermuteNumDims_G_M_Gemm1N::At(I2); // NumDimGemm1N
assert(c_gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN &&
c_gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN);
const auto to_tuple = [&](auto& vec, auto start, auto end) {
return generate_tuple([&](auto i) { return vec[start + i]; }, Number<end - start>{});
};
const auto c_ms_ns_lengths = to_tuple(
c_gs_ms_ns_lengths_vec, Number<NumDimG>{}, Number<NumDimG + NumDimM + NumDimN>{});
const auto c_ms_ns_strides = to_tuple(
c_gs_ms_ns_strides_vec, Number<NumDimG>{}, Number<NumDimG + NumDimM + NumDimN>{});
// dimension Ids for M0, M1, ...
constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{};
// dimension Ids for N0, N1, ...
constexpr auto nDimIds =
typename arithmetic_sequence_gen<NumDimM, NumDimM + NumDimN, 1>::type{};
// lengths for M0, M1, ...
const auto mLengths = get_container_subset(c_ms_ns_lengths, mDimIds);
// lengths for N0, N1, ...
const auto nLengths = get_container_subset(c_ms_ns_lengths, nDimIds);
// naive tensor C[M0, M1, M2, ..., N0, N1, N2...]
const auto c_grid_desc_ms_ns =
make_naive_tensor_descriptor(c_ms_ns_lengths, c_ms_ns_strides);
// transformed tensor C[MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * N2 * ...]
const auto c_grid_desc_mraw_nraw = transform_tensor_descriptor(
c_grid_desc_ms_ns,
make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)),
make_tuple(mDimIds, nDimIds),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return matrix_padder.PadCDescriptor_M_N(c_grid_desc_mraw_nraw);
}
// assume C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...]
static auto MakeCGridDescriptor_G_M_N(const std::vector<index_t>& c_gs_ms_ns_lengths_vec,
const std::vector<index_t>& c_gs_ms_ns_strides_vec)
{
constexpr index_t NumDimG = CPermuteNumDims_G_M_Gemm1N::At(I0);
constexpr index_t NumDimM = CPermuteNumDims_G_M_Gemm1N::At(I1);
constexpr index_t NumDimN = CPermuteNumDims_G_M_Gemm1N::At(I2); // NumDimGemm1N
assert(c_gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN &&
c_gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN);
const auto to_tuple = [&](auto& vec, auto start, auto end) {
return generate_tuple([&](auto i) { return vec[start + i]; }, Number<end - start>{});
};
const auto c_gs_ms_ns_lengths =
to_tuple(c_gs_ms_ns_lengths_vec, Number<0>{}, Number<NumDimG + NumDimM + NumDimN>{});
const auto c_gs_ms_ns_strides =
to_tuple(c_gs_ms_ns_strides_vec, Number<0>{}, Number<NumDimG + NumDimM + NumDimN>{});
// dimension Ids for G0, G1, ...
constexpr auto gDimIds = typename arithmetic_sequence_gen<0, NumDimG, 1>::type{};
// dimension Ids for M0, M1, ...
constexpr auto mDimIds =
typename arithmetic_sequence_gen<NumDimG, NumDimG + NumDimM, 1>::type{};
// dimension Ids for N0, N1, ...
constexpr auto nDimIds = typename arithmetic_sequence_gen<NumDimG + NumDimM,
NumDimG + NumDimM + NumDimN,
1>::type{};
// lengths for G0, G1, ...
const auto gLengths = get_container_subset(c_gs_ms_ns_lengths, gDimIds);
// lengths for M0, M1, ...
const auto mLengths = get_container_subset(c_gs_ms_ns_lengths, mDimIds);
// lengths for N0, N1, ...
const auto nLengths = get_container_subset(c_gs_ms_ns_lengths, nDimIds);
// naive tensor C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...]
const auto c_grid_desc_gs_ms_ns =
make_naive_tensor_descriptor(c_gs_ms_ns_lengths, c_gs_ms_ns_strides);
// transformed tensor C[G = G0 * G1 * ..., MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 *
// N2 * ...]
const auto c_grid_desc_g_mraw_nraw =
transform_tensor_descriptor(c_grid_desc_gs_ms_ns,
make_tuple(make_merge_transform(gLengths),
make_merge_transform(mLengths),
make_merge_transform(nLengths)),
make_tuple(gDimIds, mDimIds, nDimIds),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
// this descriptor is only used for calculating the batch offset, so no padding is needed
return c_grid_desc_g_mraw_nraw;
}
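// Illustrative example: with CPermuteNumDims_G_M_Gemm1N = Sequence<2, 1, 1>, the vectors are read
// as c_gs_ms_os_lengths = {G0, G1, M, O} (with matching strides). MakeCGridDescriptor_G_M_N merges
// {G0, G1} into G = G0 * G1 for the batch-offset computation, while MakeCGridDescriptor_M_N drops
// the G dimensions and pads the merged [M, O] view according to GemmSpec.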
using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
using B1GridDesc_BK0_N_BK1 = decltype(MakeB1GridDescriptor_BK0_N_BK1(1, 1, 1));
using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N({}, {}));
using CGridDesc_G_M_N = decltype(MakeCGridDescriptor_G_M_N({}, {}));
// Tracks the points on C0 that need to be set to -inf.
// Note: there is no need to reset the M padding values, because they will not be stored out.
struct C0MatrixMask
{
C0MatrixMask(index_t NRaw) : NRaw_(NRaw) {}
__host__ __device__ bool IsUpperTriangle(index_t m, index_t n) const { return n > m; }
__host__ __device__ bool IsNOutOfBound(/*index_t m, */ index_t n) const
{
return n >= NRaw_;
}
__host__ __device__ bool IsMaskedElement(index_t m, index_t n) const
{
return IsUpperTriangle(m, n) || IsNOutOfBound(n);
}
private:
// index_t MRaw_;
index_t NRaw_;
};
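// Illustrative example: with NRaw_ = 4, element (m, n) = (3, 3) is kept (the diagonal is not
// masked), (1, 2) is masked as strict upper triangle, (5, 2) is kept, and (6, 5) is masked only
// because column 5 lies in the N padding region.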
struct ComputeBasePtrOfStridedBatch
{
ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideB1,
CGridDesc_G_M_N c_grid_desc_g_m_n)
: BatchStrideA_(BatchStrideA),
BatchStrideB_(BatchStrideB),
BatchStrideB1_(BatchStrideB1),
c_grid_desc_g_m_n_(c_grid_desc_g_m_n)
{
}
__host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_);
}
__host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB1_);
}
__host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const
{
return c_grid_desc_g_m_n_.CalculateOffset(make_multi_index(g_idx, 0, 0));
}
private:
index_t BatchStrideA_;
index_t BatchStrideB_;
index_t BatchStrideB1_;
CGridDesc_G_M_N c_grid_desc_g_m_n_;
};
// GridwiseGemm
using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
ADataType, // TODO: distinguish A/B datatype
GemmAccDataType,
CShuffleDataType,
CDataType,
AElementwiseOperation,
BElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CElementwiseOperation,
InMemoryDataOperationEnum::Set,
AGridDesc_AK0_M_AK1,
BGridDesc_BK0_N_BK1,
B1GridDesc_BK0_N_BK1,
CGridDesc_M_N,
NumGemmKPrefetchStage,
BlockSize,
MPerBlock,
NPerBlock,
KPerBlock,
Gemm1NPerBlock,
Gemm1KPerBlock,
AK1,
BK1,
B1K1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
Gemm1NXdlPerWave,
ABlockTransferThreadClusterLengths_AK0_M_AK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_AK1,
true,
ABlockLdsExtraM,
BBlockTransferThreadClusterLengths_BK0_N_BK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_BK1,
true,
BBlockLdsExtraN,
B1BlockTransferThreadClusterLengths_BK0_N_BK1,
B1BlockTransferThreadClusterArrangeOrder,
B1BlockTransferSrcAccessOrder,
B1BlockTransferSrcVectorDim,
B1BlockTransferSrcScalarPerVector,
B1BlockTransferDstScalarPerVector_BK1,
false,
B1BlockLdsExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CShuffleBlockTransferScalarPerVector_NPerBlock,
LoopSched,
matrix_padder.PadN,
MaskOutUpperTriangle>;
// Argument
// FIXME: constness
struct Argument : public BaseArgument
{
Argument(const ADataType* p_a_grid,
const BDataType* p_b_grid,
const B1DataType* p_b1_grid,
CDataType* p_c_grid,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw, // = ORaw
index_t Batch,
std::vector<index_t> c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths
std::vector<index_t> c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides
index_t StrideA,
index_t StrideB,
index_t StrideB1,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideB1,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
AccElementwiseOperation acc_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op)
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_b1_grid_{p_b1_grid},
p_c_grid_{p_c_grid},
a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
b1_grid_desc_bk0_n_bk1_{
DeviceOp::MakeB1GridDescriptor_BK0_N_BK1(NRaw, Gemm1NRaw, StrideB1)},
c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(c_gs_ms_gemm1ns_lengths,
c_gs_ms_gemm1ns_strides)},
c_grid_desc_g_m_n_{DeviceOp::MakeCGridDescriptor_G_M_N(c_gs_ms_gemm1ns_lengths,
c_gs_ms_gemm1ns_strides)},
c_grid_desc_mblock_mperblock_nblock_nperblock_{},
block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
acc_element_op_{acc_element_op},
b1_element_op_{b1_element_op},
c_element_op_{c_element_op},
batch_count_(Batch),
compute_base_ptr_of_batch_{
BatchStrideA, BatchStrideB, BatchStrideB1, c_grid_desc_g_m_n_},
c0_matrix_mask_{NRaw},
raw_lengths_m_n_k_o_{MRaw, NRaw, KRaw, Gemm1NRaw},
c_extent_lowest_{c_gs_ms_gemm1ns_lengths.back()},
c_stride_lowest_{c_gs_ms_gemm1ns_strides.back()}
{
if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
b_grid_desc_bk0_n_bk1_,
b1_grid_desc_bk0_n_bk1_,
c_grid_desc_m_n_,
block_2_ctile_map_))
{
c_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
c_grid_desc_m_n_);
}
}
// private:
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
const B1DataType* p_b1_grid_;
CDataType* p_c_grid_;
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_;
CGridDesc_M_N c_grid_desc_m_n_;
CGridDesc_G_M_N c_grid_desc_g_m_n_;
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock_;
typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
AccElementwiseOperation acc_element_op_;
B1ElementwiseOperation b1_element_op_;
CElementwiseOperation c_element_op_;
index_t batch_count_;
ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;
// check C0 masking and padding
C0MatrixMask c0_matrix_mask_;
// For robust IsSupportedArgument() check
std::vector<index_t> raw_lengths_m_n_k_o_;
index_t c_extent_lowest_;
index_t c_stride_lowest_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.b1_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_m_n_,
arg.block_2_ctile_map_))
{
throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
}
const index_t grid_size =
arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_;
// Gemm0_K
const auto K =
arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
float ave_time = 0;
auto launch_kernel = [&](auto has_main_k_block_loop_) {
const auto kernel = kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
AElementwiseOperation,
BElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CElementwiseOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
DeviceOp::B1GridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename GridwiseGemm::DefaultBlock2CTileMap,
ComputeBasePtrOfStridedBatch,
C0MatrixMask,
has_main_k_block_loop_>;
return launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_b1_grid_,
arg.p_c_grid_,
arg.a_element_op_,
arg.b_element_op_,
arg.acc_element_op_,
arg.b1_element_op_,
arg.c_element_op_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.b1_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.block_2_ctile_map_,
arg.batch_count_,
arg.compute_base_ptr_of_batch_,
arg.c0_matrix_mask_);
};
// Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so only Gemm0's
// K loop needs to be checked for a main block loop here
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
{
ave_time = launch_kernel(integral_constant<bool, true>{});
}
else
{
ave_time = launch_kernel(integral_constant<bool, false>{});
}
return ave_time;
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
{
return false;
}
// Check if C permute dimension matches GEMM + GEMM shape
const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded
const index_t c_m = arg.c_grid_desc_m_n_.GetLength(I0);
const index_t c_gemm1n = arg.c_grid_desc_m_n_.GetLength(I1);
const index_t a_m = arg.a_grid_desc_ak0_m_ak1_.GetLength(I1);
const index_t b1_gemm1n = arg.b1_grid_desc_bk0_n_bk1_.GetLength(I1);
if(!(c_g == arg.batch_count_ && c_m == a_m && c_gemm1n == b1_gemm1n))
{
return false;
}
// Note: we need the raw lengths since the threadwise copy cannot handle vector loads when
// part of a vector is out of bounds
const auto MRaw = arg.raw_lengths_m_n_k_o_[0];
const auto NRaw = arg.raw_lengths_m_n_k_o_[1];
const auto KRaw = arg.raw_lengths_m_n_k_o_[2];
const auto Gemm1NRaw = arg.raw_lengths_m_n_k_o_[3];
// Check scalar per vector requirement
const auto a_extent_lowest =
is_same_v<tensor_layout::gemm::RowMajor, ALayout> ? KRaw : MRaw;
const auto b_extent_lowest =
is_same_v<tensor_layout::gemm::RowMajor, BLayout> ? NRaw : KRaw;
const auto b1_extent_lowest =
is_same_v<tensor_layout::gemm::RowMajor, B1Layout> ? Gemm1NRaw : NRaw;
const auto c_extent_lowest = arg.c_extent_lowest_;
if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 &&
b_extent_lowest % BBlockTransferSrcScalarPerVector == 0 &&
b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 &&
c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0))
{
return false;
}
// Check vector store requirement; assumes the last (N) dimension of C is contiguous
if(arg.c_stride_lowest_ != 1)
{
return false;
}
return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.b1_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_m_n_,
arg.block_2_ctile_map_);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const ADataType* p_a,
const BDataType* p_b,
const B1DataType* p_b1,
CDataType* p_c,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw,
index_t Batch,
std::vector<index_t> c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths
std::vector<index_t> c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides
index_t StrideA,
index_t StrideB,
index_t StrideB1,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideB1,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
AccElementwiseOperation acc_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op)
{
return Argument{p_a,
p_b,
p_b1,
p_c,
MRaw,
NRaw,
KRaw,
Gemm1NRaw,
Batch,
c_gs_ms_gemm1ns_lengths,
c_gs_ms_gemm1ns_strides,
StrideA,
StrideB,
StrideB1,
BatchStrideA,
BatchStrideB,
BatchStrideB1,
a_element_op,
b_element_op,
acc_element_op,
b1_element_op,
c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
// FIXME: constness
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
const void* p_b1,
void* p_c,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw,
index_t Batch,
std::vector<index_t> c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths
std::vector<index_t> c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides
index_t StrideA,
index_t StrideB,
index_t StrideB1,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideB1,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
AccElementwiseOperation acc_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op) override
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<const B1DataType*>(p_b1),
static_cast<CDataType*>(p_c),
MRaw,
NRaw,
KRaw,
Gemm1NRaw,
Batch,
c_gs_ms_gemm1ns_lengths,
c_gs_ms_gemm1ns_strides,
StrideA,
StrideB,
StrideB1,
BatchStrideA,
BatchStrideB,
BatchStrideB1,
a_element_op,
b_element_op,
acc_element_op,
b1_element_op,
c_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< KPerBlock << ", "
<< AK1 << ", "
<< BK1 << ", "
<< MPerBlock << ", "
<< Gemm1NPerBlock << ", "
<< Gemm1KPerBlock << ", "
<< B1K1 << ", "
<< getGemmSpecializationString(GemmSpec) << ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename AccElementwiseOperation,
typename B1ElementwiseOperation,
typename CElementwiseOperation,
typename AGridDesc_AK0_M_AK1,
typename BGridDesc_BK0_N_BK1,
typename B1GridDesc_BK0_N_BK1,
typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename Block2CTileMap,
typename ComputeBasePtrOfStridedBatch,
typename C0MatrixMask,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatAB* __restrict__ p_b1_grid,
FloatC* __restrict__ p_c_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const AccElementwiseOperation acc_element_op,
const B1ElementwiseOperation b1_element_op,
const CElementwiseOperation c_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2CTileMap block_2_ctile_map,
const index_t batch_count,
const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
const C0MatrixMask c0_matrix_mask)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset,
p_b1_grid + b1_batch_offset,
p_c_grid + c_batch_offset,
p_shared,
a_element_op,
b_element_op,
acc_element_op,
b1_element_op,
c_element_op,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
b1_grid_desc_bk0_n_bk1,
c_grid_desc_mblock_mperblock_nblock_nperblock,
block_2_ctile_map,
c0_matrix_mask);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_b1_grid;
ignore = p_c_grid;
ignore = a_element_op;
ignore = b_element_op;
ignore = acc_element_op;
ignore = b1_element_op;
ignore = c_element_op;
ignore = a_grid_desc_ak0_m_ak1;
ignore = b_grid_desc_bk0_n_bk1;
ignore = b1_grid_desc_bk0_n_bk1;
ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
ignore = block_2_ctile_map;
ignore = batch_count;
ignore = compute_base_ptr_of_batch;
ignore = c0_matrix_mask;
#endif // end of if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
}
// Computes C = A * B0 * B1
// ^^^^^^ (Acc0)
// ^^^^^^^^^^^ (Acc1)
// When the GemmSpecialization involves N padding, AccElementwiseOperation should be set to
// ScaleAndResetNaNToMinusInfinity, i.e.:
//     if !isNan(AccElement)
//         AccElement *= scale
//     else
//         AccElement = -INFINITY
// Otherwise, the result may be wrong.
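// A minimal sketch (kept as a comment; the struct name and member below are assumptions, not
// the element-wise operation actually shipped with the library) of a functor with the behavior
// described above:
//
//     struct ScaleAndResetNaNToMinusInfinitySketch
//     {
//         float scale;
//         __host__ __device__ void operator()(float& acc) const
//         {
//             // mirror of the pseudocode above: scale valid elements, push NaN to -INFINITY
//             acc = isnan(acc) ? -INFINITY : acc * scale;
//         }
//     };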
template <typename ALayout,
typename BLayout, // B0Layout
typename B1Layout,
typename CLayout,
typename ADataType,
typename BDataType,
typename B1DataType,
typename CDataType,
typename GemmAccDataType,
typename CShuffleDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename AccElementwiseOperation,
typename B1ElementwiseOperation,
typename CElementwiseOperation,
GemmSpecialization GemmSpec,
index_t NumGemmKPrefetchStage,
index_t BlockSize,
index_t MPerBlock,
index_t NPerBlock, // Gemm0NPerBlock
index_t KPerBlock, // Gemm0KPerBlock
index_t Gemm1NPerBlock,
index_t Gemm1KPerBlock,
index_t AK1,
index_t BK1,
index_t B1K1,
index_t MPerXDL,
index_t NPerXDL,
index_t MXdlPerWave,
index_t NXdlPerWave,
index_t Gemm1NXdlPerWave,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsExtraN,
typename B1BlockTransferThreadClusterLengths_BK0_N_BK1,
typename B1BlockTransferThreadClusterArrangeOrder,
typename B1BlockTransferSrcAccessOrder,
index_t B1BlockTransferSrcVectorDim,
index_t B1BlockTransferSrcScalarPerVector,
index_t B1BlockTransferDstScalarPerVector_BK1,
bool B1BlockLdsExtraN,
index_t CShuffleMXdlPerWavePerShuffle,
index_t CShuffleNXdlPerWavePerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
bool MaskOutUpperTriangle,
LoopScheduler LoopSched = LoopScheduler::Default>
struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
: public DeviceBatchedGemmSoftmaxGemm<ALayout,
BLayout,
B1Layout,
CLayout,
ADataType,
BDataType,
B1DataType,
CDataType,
AElementwiseOperation,
BElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CElementwiseOperation>
{
using DeviceOp = DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto matrix_padder =
GemmGemmPadder<GemmSpec, index_t, index_t, index_t, index_t>{
MPerBlock, NPerBlock, KPerBlock, Gemm1NPerBlock};
static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
{
const auto a_grid_desc_mraw_kraw = [&]() {
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(StrideA, I1));
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(I1, StrideA));
}
}();
const auto a_grid_desc_m_k = matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
const auto M = a_grid_desc_m_k.GetLength(I0);
const auto K = a_grid_desc_m_k.GetLength(I1);
const auto AK0 = K / AK1;
return transform_tensor_descriptor(a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(M)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
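// Illustrative K-split arithmetic (values are assumptions, not from any instance in this file):
// with KRaw = 129, KPerBlock = 32 and K padding enabled, the padded K is 160; with AK1 = 8 this
// gives AK0 = K / AK1 = 20, so A is viewed as an (AK0, M, AK1) = (20, M, 8) descriptor.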
static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
const auto b_grid_desc_n_k = matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
const auto N = b_grid_desc_n_k.GetLength(I0);
const auto K = b_grid_desc_n_k.GetLength(I1);
const auto BK0 = K / BK1;
return transform_tensor_descriptor(b_grid_desc_n_k,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(N)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
// Args: Gemm1KRaw, Gemm1NRaw, StrideB1
static auto MakeB1GridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b1_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, B1Layout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, B1Layout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
const auto b1_grid_desc_n_k = matrix_padder.PadB1Descriptor_N_K(b1_grid_desc_nraw_kraw);
const auto N = b1_grid_desc_n_k.GetLength(I0);
const auto K = b1_grid_desc_n_k.GetLength(I1);
const auto B1K0 = K / B1K1;
return transform_tensor_descriptor(
b1_grid_desc_n_k,
make_tuple(make_unmerge_transform(make_tuple(B1K0, B1K1)),
make_pass_through_transform(N)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC)
{
const auto c_grid_desc_mraw_nraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(StrideC, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(I1, StrideC));
}
}();
return matrix_padder.PadCDescriptor_M_N(c_grid_desc_mraw_nraw);
}
// Tracks the points on C0 that need to be set to -inf.
// Note: there is no need to reset the M padding values, because they are never written out.
struct C0MatrixMask
{
C0MatrixMask(index_t NRaw) : NRaw_(NRaw) {}
__host__ __device__ bool IsUpperTriangle(index_t m, index_t n) const { return n > m; }
__host__ __device__ bool IsNOutOfBound(/*index_t m, */ index_t n) const
{
return n >= NRaw_;
}
__host__ __device__ bool IsMaskedElement(index_t m, index_t n) const
{
return IsUpperTriangle(m, n) || IsNOutOfBound(n);
}
private:
// index_t MRaw_;
index_t NRaw_;
};
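// Example of the mask predicate (illustrative values): with NRaw = 4,
//     C0MatrixMask mask{4};
//     mask.IsMaskedElement(2, 3); // true : n > m, upper-triangle element
//     mask.IsMaskedElement(3, 5); // true : n >= NRaw, element in the N padding region
//     mask.IsMaskedElement(3, 1); // false: valid lower-triangle element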
struct ComputeBasePtrOfStridedBatch
{
ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideB1,
index_t BatchStrideC)
: BatchStrideA_(BatchStrideA),
BatchStrideB_(BatchStrideB),
BatchStrideB1_(BatchStrideB1),
BatchStrideC_(BatchStrideC)
{
}
__host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_);
}
__host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB1_);
}
__host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideC_);
}
private:
index_t BatchStrideA_;
index_t BatchStrideB_;
index_t BatchStrideB1_;
index_t BatchStrideC_;
};
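// Illustrative offset arithmetic (values are assumptions): with BatchStrideA = 1024 * 64 = 65536
// elements, GetABasePtr(3) returns 3 * static_cast<long_index_t>(65536) = 196608, i.e. batch 3 of
// A starts at p_a_grid + 196608; the long_index_t cast keeps the product from overflowing 32-bit
// index_t for large strides or batch indices.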
using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
using B1GridDesc_BK0_N_BK1 = decltype(MakeB1GridDescriptor_BK0_N_BK1(1, 1, 1));
using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
// GridwiseGemm
using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
ADataType, // TODO: distinguish A/B datatype
GemmAccDataType,
CShuffleDataType,
CDataType,
AElementwiseOperation,
BElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CElementwiseOperation,
InMemoryDataOperationEnum::Set,
AGridDesc_AK0_M_AK1,
BGridDesc_BK0_N_BK1,
B1GridDesc_BK0_N_BK1,
CGridDesc_M_N,
NumGemmKPrefetchStage,
BlockSize,
MPerBlock,
NPerBlock,
KPerBlock,
Gemm1NPerBlock,
Gemm1KPerBlock,
AK1,
BK1,
B1K1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
Gemm1NXdlPerWave,
ABlockTransferThreadClusterLengths_AK0_M_AK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_AK1,
true,
ABlockLdsExtraM,
BBlockTransferThreadClusterLengths_BK0_N_BK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_BK1,
true,
BBlockLdsExtraN,
B1BlockTransferThreadClusterLengths_BK0_N_BK1,
B1BlockTransferThreadClusterArrangeOrder,
B1BlockTransferSrcAccessOrder,
B1BlockTransferSrcVectorDim,
B1BlockTransferSrcScalarPerVector,
B1BlockTransferDstScalarPerVector_BK1,
false,
B1BlockLdsExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CShuffleBlockTransferScalarPerVector_NPerBlock,
LoopSched,
matrix_padder.PadN,
MaskOutUpperTriangle>;
// Argument
struct Argument : public BaseArgument
{
Argument(const ADataType* p_a_grid,
const BDataType* p_b_grid,
const B1DataType* p_b1_grid,
CDataType* p_c_grid,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw, // = ORaw
index_t Batch,
index_t StrideA,
index_t StrideB,
index_t StrideB1,
index_t StrideC,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideB1,
index_t BatchStrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
AccElementwiseOperation acc_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op)
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_b1_grid_{p_b1_grid},
p_c_grid_{p_c_grid},
a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
b1_grid_desc_bk0_n_bk1_{
DeviceOp::MakeB1GridDescriptor_BK0_N_BK1(NRaw, Gemm1NRaw, StrideB1)},
c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, Gemm1NRaw, StrideC)},
c_grid_desc_mblock_mperblock_nblock_nperblock_{},
block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
acc_element_op_{acc_element_op},
b1_element_op_{b1_element_op},
c_element_op_{c_element_op},
batch_count_(Batch),
compute_base_ptr_of_batch_{BatchStrideA, BatchStrideB, BatchStrideB1, BatchStrideC},
c0_matrix_mask_{NRaw},
raw_lengths_m_n_k_o_{MRaw, NRaw, KRaw, Gemm1NRaw}
{
if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
b_grid_desc_bk0_n_bk1_,
b1_grid_desc_bk0_n_bk1_,
c_grid_desc_m_n_,
block_2_ctile_map_))
{
c_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
c_grid_desc_m_n_);
}
}
// private:
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
const B1DataType* p_b1_grid_;
CDataType* p_c_grid_;
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_;
CGridDesc_M_N c_grid_desc_m_n_;
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock_;
typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
AccElementwiseOperation acc_element_op_;
B1ElementwiseOperation b1_element_op_;
CElementwiseOperation c_element_op_;
index_t batch_count_;
ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;
// check C0 masking and padding
C0MatrixMask c0_matrix_mask_;
// For robust IsSupportedArgument() check
std::vector<index_t> raw_lengths_m_n_k_o_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.b1_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_m_n_,
arg.block_2_ctile_map_))
{
throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
}
const index_t grid_size =
arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_;
// Gemm0_K
const auto K =
arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
float ave_time = 0;
auto launch_kernel = [&](auto has_main_k_block_loop_) {
const auto kernel = kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
AElementwiseOperation,
BElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CElementwiseOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
DeviceOp::B1GridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename GridwiseGemm::DefaultBlock2CTileMap,
ComputeBasePtrOfStridedBatch,
C0MatrixMask,
has_main_k_block_loop_>;
return launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_b1_grid_,
arg.p_c_grid_,
arg.a_element_op_,
arg.b_element_op_,
arg.acc_element_op_,
arg.b1_element_op_,
arg.c_element_op_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.b1_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.block_2_ctile_map_,
arg.batch_count_,
arg.compute_base_ptr_of_batch_,
arg.c0_matrix_mask_);
};
// Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so only Gemm0's
// K loop needs to be checked for a main block loop here
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
{
ave_time = launch_kernel(integral_constant<bool, true>{});
}
else
{
ave_time = launch_kernel(integral_constant<bool, false>{});
}
return ave_time;
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
{
return false;
}
// Note: we need the raw lengths since the threadwise copy cannot handle vector loads when
// part of a vector is out of bounds
const auto MRaw = arg.raw_lengths_m_n_k_o_[0];
const auto NRaw = arg.raw_lengths_m_n_k_o_[1];
const auto KRaw = arg.raw_lengths_m_n_k_o_[2];
const auto Gemm1NRaw = arg.raw_lengths_m_n_k_o_[3];
// Check scalar per vector requirement
const auto a_extent_lowest =
is_same_v<tensor_layout::gemm::RowMajor, ALayout> ? KRaw : MRaw;
const auto b_extent_lowest =
is_same_v<tensor_layout::gemm::RowMajor, BLayout> ? NRaw : KRaw;
const auto b1_extent_lowest =
is_same_v<tensor_layout::gemm::RowMajor, B1Layout> ? Gemm1NRaw : NRaw;
const auto c_extent_lowest =
is_same_v<tensor_layout::gemm::RowMajor, CLayout> ? Gemm1NRaw : MRaw;
if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 &&
b_extent_lowest % BBlockTransferSrcScalarPerVector == 0 &&
b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 &&
c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0))
{
return false;
}
return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.b1_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_m_n_,
arg.block_2_ctile_map_);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const ADataType* p_a,
const BDataType* p_b,
const B1DataType* p_b1,
CDataType* p_c,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw,
index_t Batch,
index_t StrideA,
index_t StrideB,
index_t StrideB1,
index_t StrideC,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideB1,
index_t BatchStrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
AccElementwiseOperation acc_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op)
{
return Argument{p_a, p_b, p_b1, p_c, MRaw,
NRaw, KRaw, Gemm1NRaw, Batch, StrideA,
StrideB, StrideB1, StrideC, BatchStrideA, BatchStrideB,
BatchStrideB1, BatchStrideC, a_element_op, b_element_op, acc_element_op,
b1_element_op, c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
const void* p_b1,
void* p_c,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw,
index_t Batch,
index_t StrideA,
index_t StrideB,
index_t StrideB1,
index_t StrideC,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideB1,
index_t BatchStrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
AccElementwiseOperation acc_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op) override
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<const B1DataType*>(p_b1),
static_cast<CDataType*>(p_c),
MRaw,
NRaw,
KRaw,
Gemm1NRaw,
Batch,
StrideA,
StrideB,
StrideB1,
StrideC,
BatchStrideA,
BatchStrideB,
BatchStrideB1,
BatchStrideC,
a_element_op,
b_element_op,
acc_element_op,
b1_element_op,
c_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< KPerBlock << ", "
<< AK1 << ", "
<< BK1 << ", "
<< MPerBlock << ", "
<< Gemm1NPerBlock << ", "
<< Gemm1KPerBlock << ", "
<< B1K1 << ", "
<< getGemmSpecializationString(GemmSpec) << ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
/*
 * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
 *
 * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of the A, B, C
 * matrices for a given batch index. For example, ComputePtrOffsetOfStridedBatch() computes the
 * offsets of evenly strided batches, but this can easily be extended to other layouts. The
 * returned offset can be either \p index_t or \p long_index_t. If it returns \p long_index_t, we
 * are not subject to the 2GB limitation.
 *
 * \tparam Block2CTileMap Block2CTileMap::CalculateBottomIndex() takes in the id of a workgroup
 * and returns the 2D index of the tile that it computes. \see
 * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run().
 *
 * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
 * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid
 * descriptor (as in BatchedGEMM), or use their own grid descriptors (as in GroupedGemm). \link
 * device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for
 * \link DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the
 * computation of the pointer offset into \p ComputePtrOffsetOfStridedBatch.
 *
 * \note \p Block2CTileMap allows customized mapping between a workgroup and the C-tile it
 * computes. Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm
 * fusion) to realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusions).
 *
 */
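// A minimal sketch (comment only; the struct name and offset source are assumptions, not part of
// this file) of a custom ComputePtrOffsetOfBatch that returns long_index_t, so per-batch offsets
// are not limited by 32-bit index arithmetic:
//
//     struct ComputePtrOffsetOfRaggedBatch
//     {
//         // per-batch element offsets, precomputed on the host, stored in device-visible memory
//         const long_index_t* a_offsets_;
//         const long_index_t* b_offsets_;
//         const long_index_t* c_offsets_;
//         __host__ __device__ long_index_t GetAPtrOffset(index_t g) const { return a_offsets_[g]; }
//         __host__ __device__ long_index_t GetBPtrOffset(index_t g) const { return b_offsets_[g]; }
//         __host__ __device__ long_index_t GetCPtrOffset(index_t g) const { return c_offsets_[g]; }
//     };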
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_K0_M_K1,
typename BGridDesc_K0_N_K1,
typename CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename ComputePtrOffsetOfBatch,
typename Block2CTileMap,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_xdlops_v2r3(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const index_t batch_count,
const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
const Block2CTileMap block_2_ctile_map)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset,
p_c_grid + c_batch_offset,
p_shared,
a_grid_desc_k0_m_k1,
b_grid_desc_k0_n_k1,
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
a_element_op,
b_element_op,
c_element_op,
block_2_ctile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
ignore = batch_count;
ignore = a_grid_desc_k0_m_k1;
ignore = b_grid_desc_k0_n_k1;
ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
ignore = compute_ptr_offset_of_batch;
ignore = block_2_ctile_map;
#endif
}
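// Illustrative grid decomposition (numbers are assumptions): if the C grid needs 120 tiles per
// batch and batch_count = 8, the launch uses 8 * 120 = 960 blocks; inside the kernel,
// num_blocks_per_batch = 960 / 8 = 120, so block 245 computes for batch g_idx = 245 / 120 = 2,
// and the C tile it owns within that batch is resolved by block_2_ctile_map.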
template <typename ADataType,
typename BDataType,
typename CDataType,
typename AccDataType,
typename ALayout,
typename BLayout,
typename CLayout,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t K0PerBlock,
ck::index_t K1,
ck::index_t MPerXDL,
ck::index_t NPerXDL,
ck::index_t MXdlPerWave,
ck::index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_K0_M_K1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1,
bool BBlockLdsAddExtraN,
ck::index_t CThreadTransferSrcDstVectorDim,
ck::index_t CThreadTransferDstScalarPerVector>
struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation>
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto K1Number = Number<K1>{};
static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
{
assert(K % K1 == 0);
const index_t K0 = K / K1;
const auto a_grid_desc_m_k = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
}
}();
const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
const auto a_grid_desc_k0_mp_k1 =
transform_tensor_descriptor(a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_right_pad_transform(M, PadM)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_k0_mp_k1;
}
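// Padding arithmetic example (illustrative values): with M = 100 and MPerBlock = 64,
// PadM = (64 - 100 % 64) % 64 = 28, so the padded M extent is 128; with M = 128 the outer
// modulo yields PadM = 0 and no padding is added.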
static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
{
assert(K % K1 == 0);
const index_t K0 = K / K1;
const auto b_grid_desc_k_n = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
}
}();
const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
const auto b_grid_desc_k0_np_k1 =
transform_tensor_descriptor(b_grid_desc_k_n,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_right_pad_transform(N, PadN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_k0_np_k1;
}
static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
{
const auto c_grid_desc_m_n = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
}
}();
const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
const auto c_grid_desc_mp_np = transform_tensor_descriptor(
c_grid_desc_m_n,
make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return c_grid_desc_mp_np;
}
using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
struct ComputePtrOffsetOfStridedBatch
{
ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideC)
: BatchStrideA_(BatchStrideA), BatchStrideB_(BatchStrideB), BatchStrideC_(BatchStrideC)
{
}
__host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_);
}
__host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideC_);
}
private:
index_t BatchStrideA_;
index_t BatchStrideB_;
index_t BatchStrideC_;
};
// GridwiseGemm
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
ADataType, // TODO: distinguish A/B datatype
AccDataType,
CDataType,
InMemoryDataOperationEnum::Set,
AGridDesc_K0_M_K1,
BGridDesc_K0_N_K1,
CGridDesc_M_N,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
MPerBlock,
NPerBlock,
K0PerBlock,
MPerXDL,
NPerXDL,
K1,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
ABlockLdsAddExtraM,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
false, // BThreadTransferSrcResetCoordinateAfterRun,
BBlockLdsAddExtraN,
Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector>;
using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 =
decltype(GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{}));
using Block2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap;
// Argument
struct Argument : public BaseArgument
{
Argument(const ADataType* p_a_grid,
const BDataType* p_b_grid,
CDataType* p_c_grid,
index_t M,
index_t N,
index_t K,
index_t StrideA,
index_t StrideB,
index_t StrideC,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideC,
index_t Batch,
index_t M01,
index_t N01,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_c_grid_{p_c_grid},
Batch_(Batch),
a_grid_desc_k0_m_k1_{
DeviceBatchedGemmXdl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA)},
b_grid_desc_k0_n_k1_{
DeviceBatchedGemmXdl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB)},
c_grid_desc_m_n_{DeviceBatchedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC)},
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
compute_ptr_offset_of_batch_{BatchStrideA, BatchStrideB, BatchStrideC},
block_2_ctile_map_{
GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01)},
M01_{M01},
N01_{N01},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op}
{
if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
b_grid_desc_k0_n_k1_,
c_grid_desc_m_n_,
block_2_ctile_map_))
{
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ =
GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_);
}
}
// private:
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
CDataType* p_c_grid_;
index_t Batch_;
AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
CGridDesc_M_N c_grid_desc_m_n_;
CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;
Block2CTileMap block_2_ctile_map_;
index_t M01_;
index_t N01_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceBatchedGemmXdl::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
<< ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
<< arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0)
<< ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
<< arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.c_grid_desc_m_n_{" << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
arg.b_grid_desc_k0_n_k1_,
arg.c_grid_desc_m_n_,
arg.block_2_ctile_map_))
{
throw std::runtime_error(
"wrong! GridwiseBatchedGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting");
}
const index_t grid_size =
arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.Batch_;
const auto K =
arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2);
float ave_time = 0;
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
{
const auto kernel = kernel_batched_gemm_xdlops_v2r3<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
remove_reference_t<DeviceBatchedGemmXdl::AGridDesc_K0_M_K1>,
remove_reference_t<DeviceBatchedGemmXdl::BGridDesc_K0_N_K1>,
remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
ComputePtrOffsetOfStridedBatch,
remove_reference_t<Block2CTileMap>,
true>;
ave_time = launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.Batch_,
arg.a_grid_desc_k0_m_k1_,
arg.b_grid_desc_k0_n_k1_,
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_,
arg.compute_ptr_offset_of_batch_,
arg.block_2_ctile_map_);
}
else
{
const auto kernel = kernel_batched_gemm_xdlops_v2r3<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
remove_reference_t<DeviceBatchedGemmXdl::AGridDesc_K0_M_K1>,
remove_reference_t<DeviceBatchedGemmXdl::BGridDesc_K0_N_K1>,
remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
ComputePtrOffsetOfStridedBatch,
remove_reference_t<Block2CTileMap>,
false>;
ave_time = launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.Batch_,
arg.a_grid_desc_k0_m_k1_,
arg.b_grid_desc_k0_n_k1_,
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_,
arg.compute_ptr_offset_of_batch_,
arg.block_2_ctile_map_);
}
return ave_time;
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
arg.b_grid_desc_k0_n_k1_,
arg.c_grid_desc_m_n_,
arg.block_2_ctile_map_);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const ADataType* p_a,
const BDataType* p_b,
CDataType* p_c,
index_t M,
index_t N,
index_t K,
index_t StrideA,
index_t StrideB,
index_t StrideC,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideC,
index_t Batch,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
return Argument{p_a,
p_b,
p_c,
M,
N,
K,
StrideA,
StrideB,
StrideC,
BatchStrideA,
BatchStrideB,
BatchStrideC,
Batch,
1,
1,
a_element_op,
b_element_op,
c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
index_t M,
index_t N,
index_t K,
index_t StrideA,
index_t StrideB,
index_t StrideC,
index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideC,
index_t Batch,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op) override
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<CDataType*>(p_c),
M,
N,
K,
StrideA,
StrideB,
StrideC,
BatchStrideA,
BatchStrideB,
BatchStrideC,
Batch,
1,
1,
a_element_op,
b_element_op,
c_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceBatchedGemmXdl"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< K0PerBlock
<< ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck