formatting

ed305f6b · Umang Yadav · 9f4e3544 · ed305f6b · ed305f6b · ed305f6b
Commit ed305f6b authored Sep 28, 2023 by Umang Yadav
20 changed files
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -50,9 +50,9 @@
 #define CK_BUFFER_RESOURCE_3RD_DWORD -1
 #elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) ||                          \
-    defined(__gfx942__)                                                    // for GPU code
+    defined(__gfx942__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
-#elif defined(__gfx1030__)                                                 // for GPU code
+#elif defined(__gfx1030__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
 #elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31004000
@@ -86,7 +86,7 @@
 #endif

 // WMMA instruction
-#ifndef __HIP_DEVICE_COMPILE__                                             // for host code
+#ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_USE_AMD_WMMA
 #elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
 #define CK_USE_AMD_WMMA
@@ -107,7 +107,7 @@
 #elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
    defined(__gfx942__) // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
-#else                   // for GPU code
+#else // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
 #endif


--- a/include/ck/tensor_description/tensor_adaptor.hpp
+++ b/include/ck/tensor_description/tensor_adaptor.hpp
@@ -108,13 +108,13 @@ struct TensorAdaptor

    __host__ __device__ static constexpr index_t GetNumOfHiddenDimension()
    {
-        constexpr auto all_low_dim_ids =
-            unpack([](auto&&... xs) constexpr { return merge_sequences(xs...); },
-                   LowerDimensionHiddenIdss{});
+        constexpr auto all_low_dim_ids = unpack(
+            [](auto&&... xs) constexpr { return merge_sequences(xs...); },
+            LowerDimensionHiddenIdss{});

-        constexpr auto all_up_dim_ids =
-            unpack([](auto&&... xs) constexpr { return merge_sequences(xs...); },
-                   UpperDimensionHiddenIdss{});
+        constexpr auto all_up_dim_ids = unpack(
+            [](auto&&... xs) constexpr { return merge_sequences(xs...); },
+            UpperDimensionHiddenIdss{});

        constexpr auto all_dim_ids = merge_sequences(all_low_dim_ids, all_up_dim_ids);

@@ -338,7 +338,8 @@ __host__ __device__ constexpr auto chain_tensor_adaptors(const TensorAdaptor0& a
                TensorAdaptor1::GetLowerDimensionHiddenIdss()[itran];

            // sequence in, sequence out
-            constexpr auto low_dim_hidden_ids_1_mod = [&]() constexpr {
+            constexpr auto low_dim_hidden_ids_1_mod = [&]() constexpr
+            {
                auto low_dim_hidden_ids_1_mod_ = to_multi_index(low_dim_hidden_ids_1);

                // shift hidden id so every dim id is unique
@@ -360,7 +361,8 @@ __host__ __device__ constexpr auto chain_tensor_adaptors(const TensorAdaptor0& a
                });

                return low_dim_hidden_ids_1_mod_;
-            }();
+            }
+            ();

            return generate_sequence_v2(
                [&](auto i) constexpr { return Number<low_dim_hidden_ids_1_mod[i]>{}; },
@@ -382,7 +384,8 @@ __host__ __device__ constexpr auto chain_tensor_adaptors(const TensorAdaptor0& a
                TensorAdaptor1::GetUpperDimensionHiddenIdss()[itran];

            // sequence in, constexpr tuple out
-            constexpr auto up_dim_hidden_ids_1_mod = [&]() constexpr {
+            constexpr auto up_dim_hidden_ids_1_mod = [&]() constexpr
+            {
                auto up_dim_hidden_ids_1_mod_ = to_multi_index(up_dim_hidden_ids_1);

                // shift hidden id
@@ -391,7 +394,8 @@ __host__ __device__ constexpr auto chain_tensor_adaptors(const TensorAdaptor0& a
                });

                return up_dim_hidden_ids_1_mod_;
-            }();
+            }
+            ();

            // constexpr tuple to sequence
            return generate_sequence_v2(

--- a/include/ck/tensor_description/tensor_space_filling_curve.hpp
+++ b/include/ck/tensor_description/tensor_space_filling_curve.hpp
@@ -94,8 +94,10 @@ struct SpaceFillingCurve
        // Given tensor strides \p access_lengths, and 1D index of space-filling-curve, compute the
        // idim-th element of multidimensional index.
        // All constexpr variables have to be captured by VALUE.
-        constexpr auto compute_index = [idx_1d, access_strides](auto idim) constexpr {
-            constexpr auto compute_index_impl = [idx_1d, access_strides](auto jdim) constexpr {
+        constexpr auto compute_index = [ idx_1d, access_strides ](auto idim) constexpr
+        {
+            constexpr auto compute_index_impl = [ idx_1d, access_strides ](auto jdim) constexpr
+            {
                auto res = idx_1d.value;
                auto id  = 0;


--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -14,8 +14,8 @@ namespace device {

 struct BaseArgument
 {
-    BaseArgument()                               = default;
-    BaseArgument(const BaseArgument&)            = default;
+    BaseArgument()                    = default;
+    BaseArgument(const BaseArgument&) = default;
    BaseArgument& operator=(const BaseArgument&) = default;

    virtual ~BaseArgument() {}
@@ -26,8 +26,8 @@ struct BaseArgument
 #ifndef __HIPCC_RTC__
 struct BaseInvoker
 {
-    BaseInvoker()                              = default;
-    BaseInvoker(const BaseInvoker&)            = default;
+    BaseInvoker()                   = default;
+    BaseInvoker(const BaseInvoker&) = default;
    BaseInvoker& operator=(const BaseInvoker&) = default;

    virtual float Run(const BaseArgument*, const StreamConfig& = StreamConfig{})
@@ -41,13 +41,12 @@ struct BaseInvoker

 struct BaseOperator
 {
-    BaseOperator()                               = default;
-    BaseOperator(const BaseOperator&)            = default;
+    BaseOperator()                    = default;
+    BaseOperator(const BaseOperator&) = default;
    BaseOperator& operator=(const BaseOperator&) = default;

    virtual bool IsSupportedArgument(const BaseArgument*) { return false; }

-
 #ifndef __HIPCC_RTC__
    virtual std::string GetTypeString() const { return ""; }

@@ -66,7 +65,7 @@ struct BaseOperator

    virtual void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace) const
    {
-        //assert(p_arg);
+        // assert(p_arg);
        p_arg->p_workspace_ = p_workspace;
    }


--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
@@ -38,25 +38,25 @@ template <typename GridwiseGemm,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_contraction_multiple_d_xdl_cshuffle(
-        const FloatAB* __restrict__ p_a_grid,
-        const FloatAB* __restrict__ p_b_grid,
-        FloatDsPointer p_ds_grid,
-        FloatE* __restrict__ p_e_grid,
-        const index_t batch_count,
-        const AElementwiseOperation a_element_op,
-        const BElementwiseOperation b_element_op,
-        const CDEElementwiseOperation cde_element_op,
-        const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
-        const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
-        const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            ds_grid_desc_mblock_mperblock_nblock_nperblock,
-        const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            e_grid_desc_mblock_mperblock_nblock_nperblock,
-        const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
-        const Block2ETileMap block_2_etile_map)
+        kernel_contraction_multiple_d_xdl_cshuffle(
+            const FloatAB* __restrict__ p_a_grid,
+            const FloatAB* __restrict__ p_b_grid,
+            FloatDsPointer p_ds_grid,
+            FloatE* __restrict__ p_e_grid,
+            const index_t batch_count,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const CDEElementwiseOperation cde_element_op,
+            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
+            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
+            const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                e_grid_desc_mblock_mperblock_nblock_nperblock,
+            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+            const Block2ETileMap block_2_etile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
@@ -60,21 +60,21 @@ template <typename GridwiseGemm,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_batched_gemm_e_permute_xdl(const ABDataType* __restrict__ p_a_grid,
-                                      const ABDataType* __restrict__ p_b_grid,
-                                      EDataType* __restrict__ p_e_grid,
-                                      const index_t batch_count,
-                                      const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
-                                      const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
-                                      const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
-                                          e_grid_desc_mblock_mperblock_nblock_nperblock,
-                                      const AElementwiseOperation a_element_op,
-                                      const BElementwiseOperation b_element_op,
-                                      const CDEElementwiseOperation cde_element_op,
-                                      const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
-                                      const Block2ETileMap block_2_etile_map)
+        kernel_batched_gemm_e_permute_xdl(const ABDataType* __restrict__ p_a_grid,
+                                          const ABDataType* __restrict__ p_b_grid,
+                                          EDataType* __restrict__ p_e_grid,
+                                          const index_t batch_count,
+                                          const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
+                                          const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
+                                          const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+                                              e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                          const AElementwiseOperation a_element_op,
+                                          const BElementwiseOperation b_element_op,
+                                          const CDEElementwiseOperation cde_element_op,
+                                          const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+                                          const Block2ETileMap block_2_etile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
@@ -41,25 +41,26 @@ template <typename GridwiseGemm,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_gemm_gemm_xdl_cshuffle_v1(const FloatAB* __restrict__ p_a_grid,
-                                     const FloatAB* __restrict__ p_b_grid,
-                                     const FloatAB* __restrict__ p_b1_grid,
-                                     FloatC* __restrict__ p_c_grid,
-                                     const AElementwiseOperation a_element_op,
-                                     const BElementwiseOperation b_element_op,
-                                     const AccElementwiseOperation acc_element_op,
-                                     const B1ElementwiseOperation b1_element_op,
-                                     const CElementwiseOperation c_element_op,
-                                     const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
-                                     const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
-                                     const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
-                                     const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-                                         c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                     const Block2CTileMap block_2_ctile_map,
-                                     const index_t batch_count,
-                                     const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
+        kernel_gemm_gemm_xdl_cshuffle_v1(
+            const FloatAB* __restrict__ p_a_grid,
+            const FloatAB* __restrict__ p_b_grid,
+            const FloatAB* __restrict__ p_b1_grid,
+            FloatC* __restrict__ p_c_grid,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const AccElementwiseOperation acc_element_op,
+            const B1ElementwiseOperation b1_element_op,
+            const CElementwiseOperation c_element_op,
+            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
+            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
+            const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
+            const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                c_grid_desc_mblock_mperblock_nblock_nperblock,
+            const Block2CTileMap block_2_ctile_map,
+            const index_t batch_count,
+            const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
@@ -63,24 +63,24 @@ template <typename GridwiseGemm,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_batched_gemm_xdl(const ABDataType* __restrict__ p_a_grid,
-                            const ABDataType* __restrict__ p_b_grid,
-                            DsPointer p_ds_grid,
-                            EDataType* __restrict__ p_e_grid,
-                            const index_t batch_count,
-                            const AElementwiseOperation a_element_op,
-                            const BElementwiseOperation b_element_op,
-                            const CDEElementwiseOperation cde_element_op,
-                            const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1,
-                            const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1,
-                            const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-                                ds_grid_desc_mblock_mperblock_nblock_nperblock,
-                            const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
-                                e_grid_desc_mblock_mperblock_nblock_nperblock_,
-                            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
-                            const Block2ETileMap block_2_etile_map)
+        kernel_batched_gemm_xdl(const ABDataType* __restrict__ p_a_grid,
+                                const ABDataType* __restrict__ p_b_grid,
+                                DsPointer p_ds_grid,
+                                EDataType* __restrict__ p_e_grid,
+                                const index_t batch_count,
+                                const AElementwiseOperation a_element_op,
+                                const BElementwiseOperation b_element_op,
+                                const CDEElementwiseOperation cde_element_op,
+                                const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1,
+                                const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1,
+                                const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                                    ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+                                    e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+                                const Block2ETileMap block_2_etile_map)
 {

 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
@@ -52,23 +52,23 @@ template <typename GridwiseGemm,
          bool HasDoubleTailKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_gemm_dl_multiple_d(
-        const ABDataType* __restrict__ p_a_grid,
-        const ABDataType* __restrict__ p_b_grid,
-        DsPointer p_ds_grid,
-        EDataType* __restrict__ p_e_grid,
-        const index_t batch_count,
-        const AElementwiseOperation a_element_op,
-        const BElementwiseOperation b_element_op,
-        const CDEElementwiseOperation cde_element_op,
-        const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1,
-        const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1,
-        const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11,
-        const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11,
-        const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
-        const Block2CTileMap block_2_ctile_map)
+        kernel_gemm_dl_multiple_d(
+            const ABDataType* __restrict__ p_a_grid,
+            const ABDataType* __restrict__ p_b_grid,
+            DsPointer p_ds_grid,
+            EDataType* __restrict__ p_e_grid,
+            const index_t batch_count,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const CDEElementwiseOperation cde_element_op,
+            const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1,
+            const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1,
+            const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11,
+            const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11,
+            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+            const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||             \
    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
@@ -41,32 +41,32 @@ template <typename GridwiseGemm,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_batched_gemm_gemm_xdl_cshuffle_v1(
-        const A0B0B1DataType* __restrict__ p_a0_grid,
-        const A0B0B1DataType* __restrict__ p_b0_grid,
-        D0sPointer p_d0s_grid,
-        const A0B0B1DataType* __restrict__ p_b1_grid,
-        D1sPointer p_d1s_grid,
-        E1DataType* __restrict__ p_e1_grid,
-        const A0ElementwiseOperation a0_element_op,
-        const B0ElementwiseOperation b0_element_op,
-        const CDE0ElementwiseOperation cde0_element_op,
-        const B1ElementwiseOperation b1_element_op,
-        const CDE1ElementwiseOperation cde1_element_op,
-        const A0GridDesc_AK0_M_AK1 a0_grid_desc_ak0_m_ak1,
-        const B0GridDesc_BK0_N_BK1 b0_grid_desc_bk0_n_bk1,
-        const D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
-            d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
-        const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
-        const D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            d1s_grid_desc_mblock_mperblock_nblock_nperblock,
-        const E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            e1_grid_desc_mblock_mperblock_nblock_nperblock,
-        const Block2E1TileMap block_2_e1tile_map,
-        const index_t batch_count,
-        const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
+        kernel_batched_gemm_gemm_xdl_cshuffle_v1(
+            const A0B0B1DataType* __restrict__ p_a0_grid,
+            const A0B0B1DataType* __restrict__ p_b0_grid,
+            D0sPointer p_d0s_grid,
+            const A0B0B1DataType* __restrict__ p_b1_grid,
+            D1sPointer p_d1s_grid,
+            E1DataType* __restrict__ p_e1_grid,
+            const A0ElementwiseOperation a0_element_op,
+            const B0ElementwiseOperation b0_element_op,
+            const CDE0ElementwiseOperation cde0_element_op,
+            const B1ElementwiseOperation b1_element_op,
+            const CDE1ElementwiseOperation cde1_element_op,
+            const A0GridDesc_AK0_M_AK1 a0_grid_desc_ak0_m_ak1,
+            const B0GridDesc_BK0_N_BK1 b0_grid_desc_bk0_n_bk1,
+            const D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
+                d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
+            const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
+            const D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                d1s_grid_desc_mblock_mperblock_nblock_nperblock,
+            const E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                e1_grid_desc_mblock_mperblock_nblock_nperblock,
+            const Block2E1TileMap block_2_e1tile_map,
+            const index_t batch_count,
+            const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -38,26 +38,26 @@ template <typename GridwiseGemm,
          bool HasMainK0BlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_batched_gemm_reduce_xdl_cshuffle_v1(
-        const FloatAB* __restrict__ p_a_grid,
-        const FloatAB* __restrict__ p_b_grid,
-        FloatC* __restrict__ p_c_grid,
-        ReducePtrsGlobal p_reduces_grid,
-        const index_t batch_count,
-        const AElementwiseOperation a_element_op,
-        const BElementwiseOperation b_element_op,
-        const CElementwiseOperation c_element_op,
-        const ReduceInElementwiseOperations reduce_in_element_ops,
-        const ReduceAccElementwiseOperations reduce_out_element_ops,
-        const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
-        const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
-        const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c_grid_desc_mblock_mperblock_nblock_nperblock,
-        const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock,
-        const ComputeBasePrtOfBatch compute_base_ptr_of_batch_,
-        const Block2CTileMap block_2_ctile_map)
+        kernel_batched_gemm_reduce_xdl_cshuffle_v1(
+            const FloatAB* __restrict__ p_a_grid,
+            const FloatAB* __restrict__ p_b_grid,
+            FloatC* __restrict__ p_c_grid,
+            ReducePtrsGlobal p_reduces_grid,
+            const index_t batch_count,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const CElementwiseOperation c_element_op,
+            const ReduceInElementwiseOperations reduce_in_element_ops,
+            const ReduceAccElementwiseOperations reduce_out_element_ops,
+            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
+            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
+            const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                c_grid_desc_mblock_mperblock_nblock_nperblock,
+            const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock,
+            const ComputeBasePrtOfBatch compute_base_ptr_of_batch_,
+            const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -42,30 +42,30 @@ template <typename GridwiseGemm,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
-        const FloatAB* __restrict__ p_a_grid,
-        const FloatAB* __restrict__ p_b_grid,
-        const FloatAB* __restrict__ p_b1_grid,
-        FloatC* __restrict__ p_c_grid,
-        D0sPointer p_d0s_grid,
-        const AElementwiseOperation a_element_op,
-        const BElementwiseOperation b_element_op,
-        const C0DEElementwiseOperation c0de_element_op,
-        const B1ElementwiseOperation b1_element_op,
-        const C1DEElementwiseOperation c1de_element_op,
-        const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
-        const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
-        const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
-        const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c1_grid_desc_mblock_mperblock_nblock_nperblock,
-        const D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
-            d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
-        const Block2CTileMap block_2_ctile_map,
-        const index_t batch_count,
-        const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
-        const C0MatrixMask c0_matrix_mask)
+        kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
+            const FloatAB* __restrict__ p_a_grid,
+            const FloatAB* __restrict__ p_b_grid,
+            const FloatAB* __restrict__ p_b1_grid,
+            FloatC* __restrict__ p_c_grid,
+            D0sPointer p_d0s_grid,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const C0DEElementwiseOperation c0de_element_op,
+            const B1ElementwiseOperation b1_element_op,
+            const C1DEElementwiseOperation c1de_element_op,
+            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
+            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
+            const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
+            const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                c1_grid_desc_mblock_mperblock_nblock_nperblock,
+            const D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
+                d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
+            const Block2CTileMap block_2_ctile_map,
+            const index_t batch_count,
+            const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
+            const C0MatrixMask c0_matrix_mask)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
@@ -40,27 +40,27 @@ template <typename GridwiseGemm,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
-        const FloatAB* __restrict__ p_a_grid,
-        const FloatAB* __restrict__ p_b_grid,
-        const FloatAB* __restrict__ p_b1_grid,
-        FloatC* __restrict__ p_c_grid,
-        const AElementwiseOperation a_element_op,
-        const BElementwiseOperation b_element_op,
-        const AccElementwiseOperation acc_element_op,
-        const B1ElementwiseOperation b1_element_op,
-        const CElementwiseOperation c_element_op,
-        const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
-        const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
-        const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
-        const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c_grid_desc_mblock_mperblock_nblock_nperblock,
-        const Block2CTileMap block_2_ctile_map,
-        const index_t batch_count,
-        const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
-        const C0MatrixMask c0_matrix_mask)
+        kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
+            const FloatAB* __restrict__ p_a_grid,
+            const FloatAB* __restrict__ p_b_grid,
+            const FloatAB* __restrict__ p_b1_grid,
+            FloatC* __restrict__ p_c_grid,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const AccElementwiseOperation acc_element_op,
+            const B1ElementwiseOperation b1_element_op,
+            const CElementwiseOperation c_element_op,
+            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
+            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
+            const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
+            const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                c_grid_desc_mblock_mperblock_nblock_nperblock,
+            const Block2CTileMap block_2_ctile_map,
+            const index_t batch_count,
+            const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
+            const C0MatrixMask c0_matrix_mask)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
@@ -611,7 +611,8 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
        return true;
    }

-    static constexpr bool IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_, index_t Gemm1NRaw_)
+    static constexpr bool
+    IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_, index_t Gemm1NRaw_)
    {
        // check vector load/store
        using Row = ck::tensor_layout::gemm::RowMajor;
@@ -842,7 +843,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
    template <class ADesc, class BDesc, class B1Desc, class CDesc>
    struct Descriptor
    {
-        template<class AGridDescriptor>
+        template <class AGridDescriptor>
        static constexpr auto MakeAGridDescriptor_AK0_M_AK1(const AGridDescriptor& a_grid_desc)
        {
            const auto a_grid_desc_m_k = DeviceOp::matrix_padder.PadADescriptor_M_K(a_grid_desc);
@@ -852,14 +853,15 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle

            const auto AK0 = K / AK1;

-            return transform_tensor_descriptor(a_grid_desc_m_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                        make_pass_through_transform(M)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
        }

-        template<class BGridDescriptor> 
+        template <class BGridDescriptor>
        static constexpr auto MakeBGridDescriptor_BK0_N_BK1(const BGridDescriptor& b_grid_desc)
        {
            const auto b_grid_desc_n_k = DeviceOp::matrix_padder.PadBDescriptor_N_K(b_grid_desc);
@@ -869,14 +871,15 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle

            const auto BK0 = K / BK1;

-            return transform_tensor_descriptor(b_grid_desc_n_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                        make_pass_through_transform(N)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
        }

-        template<class B1GridDescriptor>
+        template <class B1GridDescriptor>
        static constexpr auto MakeB1GridDescriptor_BK0_N_BK1(const B1GridDescriptor& b1_grid_desc)
        {
            const auto b1_grid_desc_n_k = DeviceOp::matrix_padder.PadB1Descriptor_N_K(b1_grid_desc);
@@ -889,26 +892,24 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
            return transform_tensor_descriptor(
                b1_grid_desc_n_k,
                make_tuple(make_unmerge_transform(make_tuple(B1K0, B1K1)),
-                        make_pass_through_transform(N)),
+                           make_pass_through_transform(N)),
                make_tuple(Sequence<1>{}, Sequence<0>{}),
                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
        }

-        template<class CGridDescriptor>
+        template <class CGridDescriptor>
        static constexpr auto MakeCGridDescriptor_M_N(const CGridDescriptor& c_grid_desc)
        {
            return DeviceOp::matrix_padder.PadCDescriptor_M_N(c_grid_desc);
        }

-
        using AGridDesc_AK0_M_AK1 =
            remove_cvref_t<decltype(MakeAGridDescriptor_AK0_M_AK1(ADesc{}))>;
        using BGridDesc_BK0_N_BK1 =
            remove_cvref_t<decltype(MakeBGridDescriptor_BK0_N_BK1(BDesc{}))>;
        using B1GridDesc_BK0_N_BK1 =
            remove_cvref_t<decltype(MakeB1GridDescriptor_BK0_N_BK1(B1Desc{}))>;
-        using CGridDesc_M_N = 
-            remove_cvref_t<decltype(MakeCGridDescriptor_M_N(CDesc{}))>;
+        using CGridDesc_M_N = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(CDesc{}))>;

        // GridwiseGemm
        using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
@@ -979,8 +980,9 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
        CGridDesc_M_N c_grid_desc_m_n;
        C0MatrixMask c0_matrix_mask;
        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_descriptor_mblock_mperblock_nblock_nperblock;
-        
+        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+            c_grid_descriptor_mblock_mperblock_nblock_nperblock;
+
        // element-wise op
        AElementwiseOperation a_element_op;
        BElementwiseOperation b_element_op;
@@ -1002,10 +1004,10 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
              b_grid_desc_bk0_n_bk1{MakeBGridDescriptor_BK0_N_BK1(b)},
              b1_grid_desc_bk0_n_bk1{MakeB1GridDescriptor_BK0_N_BK1(b1)},
              c_grid_desc_m_n{MakeCGridDescriptor_M_N(c)},
-              block_2_ctile_map{GridwiseGemm::MakeDefaultBlock2CTileMap(
-                c_grid_desc_m_n)},
+              block_2_ctile_map{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n)},
              c_grid_descriptor_mblock_mperblock_nblock_nperblock{
-                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n)},
+                  GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                      c_grid_desc_m_n)},
              has_main_k_block_loop{GridwiseGemm::CalculateHasMainKBlockLoop(
                  a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2))},
              c0_matrix_mask{c.GetLength(I1)},
@@ -1013,23 +1015,20 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
              b_element_op{b_element_op_},
              b1_element_op{b1_element_op_},
              c_element_op{c_element_op_},
-              is_valid{GridwiseGemm::CheckValidity(
-                           a_grid_desc_ak0_m_ak1,
-                           b_grid_desc_bk0_n_bk1,
-                           b1_grid_desc_bk0_n_bk1,
-                           c_grid_desc_m_n,
-                           block_2_ctile_map) and 
-                        IsSupported(a_grid_desc_ak0_m_ak1.GetLength(I1), 
+              is_valid{GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1,
+                                                   b_grid_desc_bk0_n_bk1,
+                                                   b1_grid_desc_bk0_n_bk1,
+                                                   c_grid_desc_m_n,
+                                                   block_2_ctile_map) and
+                       IsSupported(a_grid_desc_ak0_m_ak1.GetLength(I1),
                                   b_grid_desc_bk0_n_bk1.GetLength(I1),
-                                   a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2),
+                                   a_grid_desc_ak0_m_ak1.GetLength(I0) *
+                                       a_grid_desc_ak0_m_ak1.GetLength(I2),
                                   b1_grid_desc_bk0_n_bk1.GetLength(I1))}
        {
        }

-        constexpr bool IsValid() const
-        {
-            return is_valid;
-        }
+        constexpr bool IsValid() const { return is_valid; }
    };

    template <class ADesc, class BDesc, class B1Desc, class CDesc>
@@ -1038,10 +1037,10 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
                    BDesc b,
                    B1Desc b1,
                    CDesc c,
-                    AElementwiseOperation a_element_op     = AElementwiseOperation{},
-                    BElementwiseOperation b_element_op     = BElementwiseOperation{},
-                    B1ElementwiseOperation b1_element_op   = B1ElementwiseOperation{},
-                    CElementwiseOperation c_element_op     = CElementwiseOperation{})
+                    AElementwiseOperation a_element_op   = AElementwiseOperation{},
+                    BElementwiseOperation b_element_op   = BElementwiseOperation{},
+                    B1ElementwiseOperation b1_element_op = B1ElementwiseOperation{},
+                    CElementwiseOperation c_element_op   = CElementwiseOperation{})
    {
        return Descriptor<ADesc, BDesc, B1Desc, CDesc>(
            a, b, b1, c, a_element_op, b_element_op, b1_element_op, c_element_op);
@@ -1061,41 +1060,43 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle

        if(desc.has_main_k_block_loop)
        {
-            Desc::GridwiseGemm::template Run<true>(p_a_grid,
-                                             p_b_grid,
-                                             p_b1_grid,
-                                             p_c_grid,
-                                             p_shared_block,
-                                             desc.a_element_op,
-                                             desc.b_element_op,
-                                             acc_element_op,
-                                             desc.b1_element_op,
-                                             desc.c_element_op,
-                                             desc.a_grid_desc_ak0_m_ak1,
-                                             desc.b_grid_desc_bk0_n_bk1,
-                                             desc.b1_grid_desc_bk0_n_bk1,
-                                             desc.c_grid_descriptor_mblock_mperblock_nblock_nperblock,
-                                             desc.block_2_ctile_map,
-                                             desc.c0_matrix_mask);
+            Desc::GridwiseGemm::template Run<true>(
+                p_a_grid,
+                p_b_grid,
+                p_b1_grid,
+                p_c_grid,
+                p_shared_block,
+                desc.a_element_op,
+                desc.b_element_op,
+                acc_element_op,
+                desc.b1_element_op,
+                desc.c_element_op,
+                desc.a_grid_desc_ak0_m_ak1,
+                desc.b_grid_desc_bk0_n_bk1,
+                desc.b1_grid_desc_bk0_n_bk1,
+                desc.c_grid_descriptor_mblock_mperblock_nblock_nperblock,
+                desc.block_2_ctile_map,
+                desc.c0_matrix_mask);
        }
        else
        {
-            Desc::GridwiseGemm::template Run<false>(p_a_grid,
-                                              p_b_grid,
-                                              p_b1_grid,
-                                              p_c_grid,
-                                              p_shared_block,
-                                              desc.a_element_op,
-                                              desc.b_element_op,
-                                              acc_element_op,
-                                              desc.b1_element_op,
-                                              desc.c_element_op,
-                                              desc.a_grid_desc_ak0_m_ak1,
-                                              desc.b_grid_desc_bk0_n_bk1,
-                                              desc.b1_grid_desc_bk0_n_bk1,
-                                              desc.c_grid_descriptor_mblock_mperblock_nblock_nperblock,
-                                              desc.block_2_ctile_map,
-                                              desc.c0_matrix_mask);
+            Desc::GridwiseGemm::template Run<false>(
+                p_a_grid,
+                p_b_grid,
+                p_b1_grid,
+                p_c_grid,
+                p_shared_block,
+                desc.a_element_op,
+                desc.b_element_op,
+                acc_element_op,
+                desc.b1_element_op,
+                desc.c_element_op,
+                desc.a_grid_desc_ak0_m_ak1,
+                desc.b_grid_desc_bk0_n_bk1,
+                desc.b1_grid_desc_bk0_n_bk1,
+                desc.c_grid_descriptor_mblock_mperblock_nblock_nperblock,
+                desc.block_2_ctile_map,
+                desc.c0_matrix_mask);
        }
    }
 };

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
@@ -48,9 +48,9 @@ namespace device {
 template <typename DeviceOp, typename GridwiseGemm, bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_batched_gemm_xdlops_v2r3(const typename DeviceOp::Argument karg)
+        kernel_batched_gemm_xdlops_v2r3(const typename DeviceOp::Argument karg)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))

--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
@@ -34,23 +34,23 @@ template <typename GridwiseGemm,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_contraction_multiple_d_xdl_cshuffle(
-        const FloatAB* __restrict__ p_a_grid,
-        const FloatAB* __restrict__ p_b_grid,
-        FloatDsPointer p_ds_grid,
-        FloatE* __restrict__ p_e_grid,
-        const AElementwiseOperation a_element_op,
-        const BElementwiseOperation b_element_op,
-        const CDEElementwiseOperation cde_element_op,
-        const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
-        const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
-        const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            ds_grid_desc_mblock_mperblock_nblock_nperblock,
-        const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            e_grid_desc_mblock_mperblock_nblock_nperblock,
-        const Block2ETileMap block_2_etile_map)
+        kernel_contraction_multiple_d_xdl_cshuffle(
+            const FloatAB* __restrict__ p_a_grid,
+            const FloatAB* __restrict__ p_b_grid,
+            FloatDsPointer p_ds_grid,
+            FloatE* __restrict__ p_e_grid,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const CDEElementwiseOperation cde_element_op,
+            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
+            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
+            const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                e_grid_desc_mblock_mperblock_nblock_nperblock,
+            const Block2ETileMap block_2_etile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))

--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
@@ -404,7 +404,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
        BBlockTransferSrcVectorDim,
        BBlockTransferSrcScalarPerVector,
        BBlockTransferDstScalarPerVector_K1,
-        false,                            // BThreadTransferSrcResetCoordinateAfterRun,
+        false, // BThreadTransferSrcResetCoordinateAfterRun,
        BBlockLdsAddExtraN,
        Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder,
        7,                                // CThreadTransferSrcDstVectorDim,

--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -436,7 +436,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
        BlockSize,
        ABDataType, // TODO: distinguish A/B datatype
        AccDataType,
-        CDataType,  // TODO: Add ShuffleType for DeviceConv2d
+        CDataType, // TODO: Add ShuffleType for DeviceConv2d
        CDataType,
        InMemoryDataOperationEnum::Set,
        AGridDesc_K0_M_K1,

--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -354,7 +354,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
        2,                 // BBlockTransferSrcVectorDim,
        BBlockTransferSrcScalarPerVector,
        BBlockTransferDstScalarPerVector_K1,
-        false,                            // BThreadTransferSrcResetCoordinateAfterRun,
+        false, // BThreadTransferSrcResetCoordinateAfterRun,
        BBlockLdsAddExtraN,
        Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder,
        7,                                // CThreadTransferSrcDstVectorDim,

--- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -37,23 +37,23 @@ template <typename GridwiseGemm,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_gemm_xdlops_v2r3_for_conv3d(
-        const FloatAB* __restrict__ p_a_grid,
-        const FloatAB* __restrict__ p_b_grid,
-        FloatC* __restrict__ p_c_grid,
-        const index_t num_batches,
-        const index_t a_batch_stride,
-        const index_t b_batch_stride,
-        const index_t c_batch_stride,
-        const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
-        const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
-        const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-        const AElementwiseOperation a_element_op,
-        const BElementwiseOperation b_element_op,
-        const CElementwiseOperation c_element_op,
-        const Block2CTileMap block_2_ctile_map)
+        kernel_gemm_xdlops_v2r3_for_conv3d(
+            const FloatAB* __restrict__ p_a_grid,
+            const FloatAB* __restrict__ p_b_grid,
+            FloatC* __restrict__ p_c_grid,
+            const index_t num_batches,
+            const index_t a_batch_stride,
+            const index_t b_batch_stride,
+            const index_t c_batch_stride,
+            const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
+            const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
+            const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const CElementwiseOperation c_element_op,
+            const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))

--- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
@@ -1005,7 +1005,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
        BBlockTransferSrcVectorDim,
        BBlockTransferSrcScalarPerVector,
        BBlockTransferDstScalarPerVector_K1,
-        false,                            // BThreadTransferSrcResetCoordinateAfterRun,
+        false, // BThreadTransferSrcResetCoordinateAfterRun,
        BBlockLdsAddExtraN,
        Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder,
        7,                                // CThreadTransferSrcDstVectorDim,